/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

27 28
#define pr_fmt(fmt) "IPv6: " fmt

29
#include <linux/capability.h>
L
Linus Torvalds 已提交
30
#include <linux/errno.h>
31
#include <linux/export.h>
L
Linus Torvalds 已提交
32 33 34 35 36 37 38 39
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
40
#include <linux/mroute6.h>
L
Linus Torvalds 已提交
41 42 43 44
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
45
#include <linux/nsproxy.h>
46
#include <linux/slab.h>
47
#include <linux/jhash.h>
48
#include <net/net_namespace.h>
L
Linus Torvalds 已提交
49 50 51 52 53 54 55 56 57
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
58
#include <net/dst_metadata.h>
L
Linus Torvalds 已提交
59
#include <net/xfrm.h>
60
#include <net/netevent.h>
61
#include <net/netlink.h>
62
#include <net/nexthop.h>
63
#include <net/lwtunnel.h>
64
#include <net/ip_tunnels.h>
D
David Ahern 已提交
65
#include <net/l3mdev.h>
66
#include <net/ip.h>
67
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
68 69 70 71 72

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

73 74 75 76 77 78 79
static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

80
/* Result codes for next-hop neighbour reachability checks.
 * Negative values are failures of increasing severity; see rt6_check_neigh()
 * and find_match() for how each one steers route selection.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* reject this route outright */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour in NUD_FAILED; probe */
	RT6_NUD_FAIL_DO_RR = -1,	/* no neighbour entry; round-robin */
	RT6_NUD_SUCCEED = 1
};

L
Linus Torvalds 已提交
87
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
88
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
89
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
L
Linus Torvalds 已提交
90 91 92 93
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
94
static int		 ip6_dst_gc(struct dst_ops *ops);
L
Linus Torvalds 已提交
95 96

static int		ip6_pkt_discard(struct sk_buff *skb);
E
Eric W. Biederman 已提交
97
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98
static int		ip6_pkt_prohibit(struct sk_buff *skb);
E
Eric W. Biederman 已提交
99
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
L
Linus Torvalds 已提交
100
static void		ip6_link_failure(struct sk_buff *skb);
101 102 103 104
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
105 106
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
107
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108
			 struct fib6_info *rt, struct dst_entry *dst,
109
			 struct in6_addr *dest, struct in6_addr *src,
110 111
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
112
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113 114
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);
L
Linus Torvalds 已提交
115

116
#ifdef CONFIG_IPV6_ROUTE_INFO
117
static struct fib6_info *rt6_add_route_info(struct net *net,
118
					   const struct in6_addr *prefix, int prefixlen,
119 120
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
121
					   unsigned int pref);
122
static struct fib6_info *rt6_get_route_info(struct net *net,
123
					   const struct in6_addr *prefix, int prefixlen,
124 125
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
126 127
#endif

128 129 130 131 132 133 134
/* Per-cpu list of "uncached" rt6_info entries (dsts not attached to the
 * FIB tree), so they can be walked on device teardown.
 */
struct uncached_list {
	spinlock_t		lock;	/* protects head */
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

135
/* Link @rt onto this cpu's uncached list and remember which list it is on
 * so rt6_uncached_list_del() can find it from any cpu.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

146
/* Unlink @rt from the uncached list it was added to, if any, and drop the
 * per-netns uncached-route counter under the same lock.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

/* Re-point uncached routes away from @dev (which is going away) to the
 * netns loopback device, on every cpu's list.  Both the idev and the dst
 * device reference are migrated so @dev can be released.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	/* loopback itself is never flushed */
	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

191
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192 193
					     struct sk_buff *skb,
					     const void *daddr)
194
{
D
David S. Miller 已提交
195
	if (!ipv6_addr_any(p))
196
		return (const void *) p;
197 198
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
199 200 201
	return daddr;
}

202 203 204 205
/* Find (or create) the neighbour entry for @gw / the chosen destination on
 * @dev.  Returns NULL on creation failure rather than an ERR_PTR.
 */
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
{
	struct neighbour *n;

	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
	if (n)
		return n;

	/* no cached entry: create one in the ndisc table */
	n = neigh_create(&nd_tbl, daddr, dev);
	return IS_ERR(n) ? NULL : n;
}

/* dst_ops->neigh_lookup hook: resolve via the route's gateway. */
static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
}

227 228 229 230 231
/* dst_ops->confirm_neigh hook: mark the next-hop neighbour as recently
 * confirmed.  Skipped for devices without neighbour resolution and for
 * multicast destinations.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

242
/* dst_ops used for all regular IPv6 dst entries; copied per-netns. */
static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};

261
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
262
{
263 264 265
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
266 267
}

268 269
/* Blackholed dsts intentionally ignore PMTU updates. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

273 274
/* Blackholed dsts intentionally ignore redirects. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

278 279 280 281
/* dst_ops for blackholed copies of dsts (see ip6_dst_blackhole callers):
 * PMTU/redirect events are no-ops.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_dst_neigh_lookup,
};

290
/* Metric template for the special routes below; hoplimit 0 means "use
 * the per-device default".
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};

294
/* Template for the per-netns fib6_null_entry: the reject route returned
 * when a FIB lookup matches nothing.
 */
static const struct fib6_info fib6_null_entry_template = {
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,	/* worst possible metric */
	.fib6_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

303
/* Template for the per-netns ip6_null_entry dst: drops packets with
 * -ENETUNREACH.
 */
static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

T
Thomas Graf 已提交
315 316
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

317
/* Template for the prohibit dst (policy routing): drops with -EACCES. */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

329
/* Template for the blackhole dst (policy routing): silently discards. */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

343 344 345 346 347 348 349 350
/* Zero the rt6_info-specific tail of a freshly allocated dst and prime
 * the uncached-list linkage.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

L
Linus Torvalds 已提交
351
/* allocate dst with ip6_dst_ops
 *
 * Returns a zero-initialized rt6_info with an initial refcount of 1, or
 * NULL on allocation failure.  Bumps the per-netns allocation counter.
 */
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);
M
Martin KaFai Lau 已提交
366

L
Linus Torvalds 已提交
367 368 369
/* dst_ops->destroy hook: release metrics, uncached-list linkage, the idev
 * reference, and the fib6_info this dst was cloned from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct fib6_info *from;
	struct inet6_dev *idev;

	ip_dst_metrics_put(dst);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	/* detach rt->from under RCU before dropping its reference */
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
	fib6_info_release(from);
	rcu_read_unlock();
}

L
Linus Torvalds 已提交
389 390 391 392 393
/* dst_ops->ifdown hook: when @dev goes down, migrate this dst's idev
 * reference to the netns loopback device.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

406 407 408 409 410 411 412 413
/* True when @rt carries RTF_EXPIRES and its expiry time has passed. */
static bool __rt6_check_expired(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & RTF_EXPIRES) &&
	       time_after(jiffies, rt->dst.expires);
}

414
/* Check whether @rt (or the fib6_info it was cloned from) has expired.
 * NOTE(review): reads rt->from with rcu_dereference — presumably called
 * under rcu_read_lock; confirm against callers.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (from) {
		/* a stale (non-FORCE_CHK) dst or an expired origin route
		 * also counts as expired
		 */
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(from);
	}
	return false;
}

430 431 432 433 434
/* Pick one route out of a multipath group by flow hash.  Each sibling owns
 * a slice of the hash space bounded by fib_nh_upper_bound; the first
 * sibling whose bound covers the hash (and that scores acceptably) wins.
 */
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
{
	struct fib6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.fib_nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.fib_nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

L
Linus Torvalds 已提交
463
/*
464
 *	Route lookup. rcu_read_lock() should be held.
L
Linus Torvalds 已提交
465 466
 */

467 468
/* Route lookup helper, called under rcu_read_lock: walk the routes that
 * share @rt's FIB node and pick one matching the requested output
 * interface (@oif) or source address (@saddr).  Falls back to the null
 * entry when strict interface matching fails or the route is dead.
 */
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct fib6_info *sprt;

	/* nothing to constrain on: accept the head route if alive */
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
		const struct net_device *dev = sprt->fib6_nh.fib_nh_dev;

		if (sprt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;

	return rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
}

501
#ifdef CONFIG_IPV6_ROUTER_PREF
502 503 504 505 506 507 508 509 510 511 512 513 514
/* Deferred-work context for a single router reachability probe. */
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;		/* router address to probe */
	struct net_device *dev;		/* held; released by the worker */
};

/* Workqueue callback: send a neighbour solicitation to the probe target
 * and release the device/work references taken by rt6_probe().
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

520
/* Schedule a deferred reachability probe for @rt's gateway, rate-limited
 * by rtr_probe_interval (per neighbour when an entry exists, otherwise per
 * route via rt->last_probe).
 */
static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !rt->fib6_nh.fib_nh_has_gw)
		return;

	nh_gw = &rt->fib6_nh.fib_nh_gw6;
	dev = rt->fib6_nh.fib_nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		/* recheck under the lock before arming a probe */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);	/* released in rt6_probe_deferred() */
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
575
/* No-op when router-preference support is compiled out. */
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

L
Linus Torvalds 已提交
580
/*
581
 * Default Router Selection (RFC 2461 6.3.6)
L
Linus Torvalds 已提交
582
 */
583
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
584
{
D
David Ahern 已提交
585
	const struct net_device *dev = rt->fib6_nh.fib_nh_dev;
586

587
	if (!oif || dev->ifindex == oif)
588
		return 2;
589
	return 0;
590
}
L
Linus Torvalds 已提交
591

592
/* Classify the reachability of @rt's gateway neighbour for route scoring;
 * routes without a gateway always succeed.  See enum rt6_nud_state.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !rt->fib6_nh.fib_nh_has_gw)
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.fib_nh_dev,
					  &rt->fib6_nh.fib_nh_gw6);
	if (neigh) {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	} else {
		/* no neighbour entry: with router-pref we probe lazily,
		 * otherwise request round-robin to another router
		 */
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	}
	rcu_read_unlock_bh();

	return ret;
}

624
/* Score @rt for selection: device match in the low bits, RA preference
 * shifted above them.  Returns a negative rt6_nud_state on failure.
 */
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
{
	int m;

	m = rt6_check_dev(rt, oif);
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
	return m;
}

642 643
/* Consider @rt as a candidate: skip dead/linkdown/expired routes, score
 * the rest, and keep it as the new @match if it beats the best score so
 * far (*mpri).  *do_rr is set when the winner asked for round-robin.
 */
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;

	if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
		goto out;

	if (ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev) &&
	    rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (fib6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

681 682 683
/* Find the best route among the siblings sharing @metric, starting the
 * scan at the round-robin head @rr_head and wrapping through @leaf.  If
 * nothing at @metric matched, continue into the higher-metric tail (@cont).
 */
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct fib6_info *rt, *match, *cont;
	int mpri = -1;

	match = NULL;
	cont = NULL;
	/* first pass: from the RR head to the end of the metric group */
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	/* second pass: from the group start up to the RR head (wrap) */
	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->fib6_next)) {
		if (rt->fib6_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

	/* fall back to the remaining higher-metric routes */
	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
		match = find_match(rt, oif, strict, &mpri, match, do_rr);

	return match;
}
L
Linus Torvalds 已提交
719

720
/* Select the best route from FIB node @fn, advancing the node's
 * round-robin pointer when the scorer requested it.  Called under
 * rcu_read_lock; returns fib6_null_entry when nothing usable exists.
 */
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;

	rt0 = rcu_dereference(fn->rr_ptr);
	if (!rt0)
		rt0 = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->fib6_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.fib6_null_entry;

	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
			     &do_rr);

	if (do_rr) {
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);

		/* no entries matched; do round-robin */
		if (!next || next->fib6_metric != rt0->fib6_metric)
			next = leaf;

		if (next != rt0) {
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->fib6_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
		}
	}

	return match ? match : net->ipv6.fib6_null_entry;
}

770
static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
771
{
772
	return (rt->fib6_flags & RTF_NONEXTHOP) || rt->fib6_nh.fib_nh_has_gw;
773 774
}

775 776
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information option received in a Router Advertisement
 * from @gwaddr: validate it, then add/update/delete the corresponding
 * RTF_ROUTEINFO route according to its lifetime and preference.
 * Returns 0 on success, -EINVAL on malformed input.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	/* a zero-length prefix denotes the default route */
	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

851 852 853 854 855
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
856
/* called with rcu_lock held: resolve the device a dst copy of @rt should
 * use, accounting for L3 master/slave relationships on local/anycast
 * routes.
 */
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
{
	struct net_device *dev = rt->fib6_nh.fib_nh_dev;

	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->fib6_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897
/* Map each fib6 route type to the dst error code reported to senders. */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

/* Return the dst error code for @fib6_type (0 for deliverable types). */
static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

898
static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
899 900 901 902 903 904 905 906 907 908 909 910 911
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

912
/* Set up input/output handlers and dst.error for a reject-type route
 * (blackhole, prohibit, unreachable/throw) cloned from @ort.
 */
static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

934
/* Initialize the dst handlers of @rt from its origin route @ort: reject
 * routes get error handlers, local/anycast deliver locally, multicast
 * goes through ip6_mc_input, everything else is forwarded.  Lightweight
 * tunnel state is shared from @ort.
 */
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
{
	if (ort->fib6_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.fib_nh_lws) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.fib_nh_lws);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

960
/* Caller must already hold reference to @from */
/* Attach @from as the origin fib6_info of @rt and share its metrics. */
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
	rcu_assign_pointer(rt->from, from);
	ip_dst_metrics_init(&rt->dst, from->fib6_metrics);
}

968
/* Caller must already hold reference to @ort */
/* Populate a freshly allocated rt6_info from origin route @ort: dst
 * handlers, destination/source keys, idev, flags and gateway.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
{
	struct net_device *dev = fib6_info_nh_dev(ort);

	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->fib6_dst;
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
	rt->rt6i_flags = ort->fib6_flags;
	if (ort->fib6_nh.fib_nh_has_gw) {
		rt->rt6i_gateway = ort->fib6_nh.fib_nh_gw6;
		rt->rt6i_flags |= RTF_GATEWAY;
	}
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->fib6_src;
#endif
}

M
Martin KaFai Lau 已提交
988 989 990
/* Walk back up the FIB trie from @fn, descending into source-address
 * subtrees along the way, until a node carrying route info is found.
 * Returns NULL when the tree root is reached without a match.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	struct fib6_node *pn, *sn;
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_node_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
T
Thomas Graf 已提交
1005

1006
/* Try to take a reference on *prt.  On failure (dst being freed),
 * substitute the netns null entry when @net is given, or NULL otherwise.
 * Returns true when the original dst was successfully held.
 */
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (net) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

1022
/* called with rcu_lock held */
/* Clone a dst entry from fib6_info @rt.  Falls back to the held netns
 * null entry if @rt is going away or the allocation fails, so the caller
 * always receives a referenced dst.
 */
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev = rt->fib6_nh.fib_nh_dev;
	struct rt6_info *nrt;

	if (!fib6_info_hold_safe(rt))
		goto fallback;

	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
	if (!nrt) {
		fib6_info_release(rt);
		goto fallback;
	}

	ip6_rt_copy_init(nrt, rt);
	return nrt;

fallback:
	nrt = dev_net(dev)->ipv6.ip6_null_entry;
	dst_hold(&nrt->dst);
	return nrt;
}

/* Policy-rule lookup backend for the "lookup only" path (no per-cpu or
 * uncached dst creation beyond what the fast path needs).
 * Returns a referenced rt6_info; falls back to the table's null entry
 * when nothing matches.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct fib6_info *f6i;
	struct fib6_node *fn;
	struct rt6_info *rt;

	/* Caller asked to ignore the nexthop's output device restriction */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
	} else {
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		/* No oif given: let multipath selection pick a sibling */
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
	}
	if (f6i == net->ipv6.fib6_null_entry) {
		/* Nothing at this node: climb the trie and retry */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);
	} else if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		/* materialize a dst for this fib entry under RCU */
		rt = ip6_create_rt_rcu(f6i);
	}

	rcu_read_unlock();

	return rt;
}

/* Public entry point: run the lookup through the FIB policy rules,
 * dispatching to ip6_pol_route_lookup() for each matching table.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

/* Convenience lookup by address pair.
 * Builds a flowi6 from @daddr/@saddr/@oif and returns the resulting
 * rt6_info with a reference held, or NULL when the lookup yields an
 * error dst (the error dst's reference is dropped here).
 */
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
{
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
	};
	struct dst_entry *dst;
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;

	if (saddr) {
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

	return NULL;
}
EXPORT_SYMBOL(rt6_lookup);

/* ip6_ins_rt is called with FREE table->tb6_lock.
 * It takes new route entry, the addition fails by any reason the
 * route is released.
 * Caller must hold dst before calling it.
 */

/* Insert @rt into its fib6 table under tb6_lock.
 * Returns 0 on success or a negative errno from fib6_add().
 */
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
			struct netlink_ext_ack *extack)
{
	int err;
	struct fib6_table *table;

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

/* Insert @rt with default netlink info (no extack, notifications to @net) */
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = {	.nl_net = net, };

	return __ip6_ins_rt(rt, &info, NULL);
}

/* Clone @ort into a host-route (/128) RTF_CACHE dst for @daddr.
 * Takes a fib6_info reference that is released if allocation fails.
 * Returns the new rt6_info (refcount from ip6_dst_alloc) or NULL.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

	if (!fib6_info_hold_safe(ort))
		return NULL;

	dev = ip6_rt_get_dev_rcu(ort);
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
	if (!rt) {
		fib6_info_release(ort);
		return NULL;
	}

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;	/* cache entries are always host routes */

	if (!rt6_is_gw_or_nonexthop(ort)) {
		/* daddr equal to a non-/128 prefix address is an anycast use */
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
		}
#endif
	}

	return rt;
}
/* Allocate a per-cpu dst copy of @rt (marked RTF_PCPU).
 * Takes a fib6_info reference that is released on allocation failure.
 * Returns the new rt6_info or NULL.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
{
	unsigned short flags = fib6_info_dst_flags(rt);
	struct net_device *dev;
	struct rt6_info *pcpu_rt;

	if (!fib6_info_hold_safe(rt))
		return NULL;

	/* RCU protects the nexthop device lookup */
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
	rcu_read_unlock();
	if (!pcpu_rt) {
		fib6_info_release(rt);
		return NULL;
	}
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with rcu_read_lock() acquired */
/* Return this cpu's cached dst for @rt with a reference taken
 * (ip6_hold_safe with net==NULL may replace it with NULL when the
 * refcount has already dropped to zero), or NULL if none is cached.
 */
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt);

	return pcpu_rt;
}

/* Create and publish the per-cpu dst for @rt on this cpu.
 * Returns the new dst with an extra reference for the caller, or a held
 * ip6_null_entry on allocation failure.
 */
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct fib6_info *rt)
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
	}

	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	/* Publish atomically; the slot must still be empty because the
	 * caller runs with BHs disabled on this cpu.
	 */
	prev = cmpxchg(p, NULL, pcpu_rt);
	BUG_ON(prev);

	return pcpu_rt;
}

/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
	struct fib6_info *from;
	struct net *net;

	if (!bucket || !rt6_ex)
		return;

	net = dev_net(rt6_ex->rt6i->dst.dev);
	net->ipv6.rt6_stats->fib_rt_cache--;

	/* purge completely the exception to allow releasing the held resources:
	 * some [sk] cache may keep the dst around for unlimited time
	 */
	from = rcu_dereference_protected(rt6_ex->rt6i->from,
					 lockdep_is_held(&rt6_exception_lock));
	rcu_assign_pointer(rt6_ex->rt6i->from, NULL);
	fib6_info_release(from);
	dst_dev_put(&rt6_ex->rt6i->dst);

	/* unlink first, then drop the table's dst reference; readers may
	 * still see the entry until the RCU grace period ends
	 */
	hlist_del_rcu(&rt6_ex->hlist);
	dst_release(&rt6_ex->rt6i->dst);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *ex, *victim = NULL;

	if (!bucket)
		return;

	/* pick the entry with the earliest timestamp */
	hlist_for_each_entry(ex, &bucket->chain, hlist) {
		if (!victim || time_before(ex->stamp, victim->stamp))
			victim = ex;
	}

	/* rt6_remove_exception() tolerates a NULL victim (empty chain) */
	rt6_remove_exception(bucket, victim);
}

/* Hash a (dst, src) address pair into a bucket index.
 * NOTE(review): jhash with a boot-time random seed is used here; later
 * kernels hardened this with siphash against off-path hash-flooding —
 * worth confirming whether that backport applies to this tree.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	/* src only participates when source-routing subtrees are enabled */
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	/* advance *bucket to the chain for this address pair */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	/* advance *bucket to the chain for this address pair */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	/* lockless traversal; entries are unlinked with hlist_del_rcu() */
	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Effective MTU for @rt: the stored path MTU if any, otherwise the
 * nexthop device's IPv6 MTU, clamped to IP6_MAX_MTU and reduced by any
 * lwtunnel encap headroom.
 */
static unsigned int fib6_mtu(const struct fib6_info *rt)
{
	unsigned int mtu;

	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		/* NOTE(review): idev is dereferenced without a NULL check —
		 * presumably a device carrying an IPv6 route always has an
		 * inet6_dev; confirm against callers.
		 */
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.fib_nh_lws, mtu);
}

/* Insert cached route @nrt into @ort's exception table.
 * Allocates the bucket array on first use, replaces an existing entry
 * for the same key, enforces FIB6_MAX_DEPTH per bucket, and bumps the
 * table sernum on success so stale dsts are invalidated.
 * Returns 0 or a negative errno.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
				struct fib6_info *ort)
{
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	/* table was flushed (route being deleted) — refuse to recreate it */
	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->fib6_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
	if (!err) {
		spin_lock_bh(&ort->fib6_table->tb6_lock);
		fib6_update_sernum(net, ort);
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

/* Drop every cached exception attached to @rt and prevent new ones
 * from being inserted (used when the route is going away).
 */
void rt6_flush_exceptions(struct fib6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
/* Returns the matching, non-expired cached route or NULL.  No reference
 * is taken; the caller must do so before leaving the RCU section.
 */
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->fib6_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
/* Returns 0 on removal, -EINVAL for a non-cache rt or one with no
 * parent fib6_info, -ENOENT when the entry is not in the table.
 */
static int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;
	int err;

	from = rcu_dereference(rt->from);
	if (!from ||
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct fib6_info *from;

	rcu_read_lock();
	/* only RTF_CACHE routes with a live parent live in the table */
	from = rcu_dereference(rt->from);
	if (!from || !(rt->rt6i_flags & RTF_CACHE))
		goto unlock;

	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->fib6_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	/* stamp feeds the LRU eviction in rt6_exception_remove_oldest() */
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

unlock:
	rcu_read_unlock();
}

1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

/* Propagate an MTU change to every cached exception of @rt that is
 * allowed to take it (see rt6_mtu_change_route_allowed()).
 * Caller must hold rt6_exception_lock.
 */
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct fib6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
			 * route), the metrics of its rt->from have already
			 * been updated.
			 */
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
		}
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove every cached gateway exception of @rt whose gateway address
 * equals @gateway (used when a router becomes unreachable).
 */
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				/* match only entries that are both cached and
				 * gatewayed, pointing at @gateway
				 */
				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

/* GC one exception entry: remove it when aged out, expired, or when its
 * gateway neighbour is no longer flagged as a router; otherwise count it
 * in gc_args->more so the GC timer keeps running.
 * Caller holds rt6_exception_lock (and rcu_read_lock_bh for the
 * neighbour lookup).
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
			neigh_flags = neigh->flags;

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

/* Run rt6_age_examine_exception() over every exception of @rt.
 * Takes rcu_read_lock_bh (for neighbour lookups) plus rt6_exception_lock.
 */
void rt6_age_exceptions(struct fib6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* cheap lockless check before taking the locks */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}

/* must be called with rcu lock held */
/* Core fib6 table lookup: select a route for @fl6, backtracking up the
 * trie on miss and, as a last resort, retrying from the original node
 * without the REACHABLE restriction.  Never returns NULL — on total
 * failure the result is net->ipv6.fib6_null_entry.
 */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
{
	struct fib6_node *fn, *saved_fn;
	struct fib6_info *f6i;

	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;

	/* ignore the output-interface restriction when asked to */
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

redo_rt6_select:
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	trace_fib6_table_lookup(net, f6i, table, fl6);

	return f6i;
}

/* Full policy-route resolution: look up a fib6_info and turn it into a
 * referenced dst via one of three paths — a cached exception, an
 * uncached RTF_CACHE clone (FLOWI_FLAG_KNOWN_NH without a gateway), or
 * the per-cpu dst copy.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	/* hosts (no forwarding) prefer routes with reachable nexthops */
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		return rt;
	}

	/*Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !f6i->fib6_nh.fib_nh_has_gw)) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		return uncached_rt;
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		/* BHs disabled keeps us on this cpu for the pcpu slot */
		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();

		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
/* Input-path policy lookup: resolve using the incoming interface */
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

/* Input route lookup through the policy rules; forces strict interface
 * matching for link-local/multicast destinations (except PIM register
 * devices).
 */
struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
/* Fill @keys with the L3 fields used for multipath hashing.
 * For ICMPv6 error messages the embedded (inner) header is used so the
 * error follows the same path as the original flow; otherwise the outer
 * header (or pre-dissected @flkeys, when available) is used.
 */
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	/* only error messages embed the offending packet's header */
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	/* hash on the inner header; pre-dissected keys describe the
	 * outer flow, so they no longer apply
	 */
	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowlabel(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

/* if skb is set it will be used and fl6 can be NULL */
/* Compute the multipath hash per the netns hash policy:
 * policy 0 hashes L3 fields (addresses + flow label + proto), policy 1
 * hashes the L4 4-tuple.  Result is shifted right by one so 0 can serve
 * as "no hash" in fl6->mp_hash.
 */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}

/* Attach the route (dst) for a received packet to @skb.
 * Builds a flowi6 from the IPv6 header, honours collect-metadata tunnel
 * info, and computes the multipath hash for ICMPv6 so errors follow the
 * offending flow's path.
 */
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	/* reuse the dissection done by the fib rules, when available */
	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);

	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

/* Policy-routing callback for output lookups: forwards to ip6_pol_route()
 * using the flow's output interface.  Passed to fib6_rule_lookup().
 */
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

/* Output-path route lookup for locally generated traffic.
 * Adjusts the lookup flags based on socket binding, destination scope
 * and source-address presence before handing off to fib6_rule_lookup().
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	/* link-local/multicast on an L3 master device needs a scoped lookup */
	if (ipv6_addr_type(&fl6->daddr) &
	    (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		/* let the socket's IPV6_ADDR_PREFERENCES steer saddr choice */
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
/* Clone @dst_orig into a blackhole dst that silently discards all
 * traffic (used e.g. by xfrm when packets must be held back).
 * Copies metrics/keys from the original and always releases @dst_orig.
 * Returns the new dst, or ERR_PTR(-ENOMEM) on allocation failure.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		/* both directions discard */
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		/* RTF_PCPU is per-cpu-cache internal state; must not be cloned */
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

L
Linus Torvalds 已提交
2135 2136 2137 2138
/*
 *	Destination cache support functions
 */

2139
static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2140
{
2141 2142
	u32 rt_cookie = 0;

2143
	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2144 2145 2146 2147 2148 2149
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
2150 2151
}

2152 2153 2154
static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
2155
{
2156
	u32 rt_cookie = 0;
2157

2158
	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2159
	    rt_cookie != cookie)
2160 2161 2162 2163 2164 2165 2166 2167
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

2168 2169 2170
static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
2171
{
2172 2173
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2174
	    fib6_check(from, cookie))
2175 2176 2177 2178 2179
		return &rt->dst;
	else
		return NULL;
}

/* dst_ops->check handler: decide whether a cached dst is still usable.
 * Dispatches to rt6_dst_from_check() for pcpu/uncached clones (which are
 * validated through their fib6 parent) and rt6_check() otherwise.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}

/* dst_ops->negative_advice handler: called when an upper layer reports
 * that a dst looks bad.  Expired cached (exception) routes are removed
 * from the exception table; non-cache dsts are simply released.
 * Returns NULL when the dst was dropped, otherwise the dst unchanged.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			rcu_read_lock();
			if (rt6_check_expired(rt)) {
				/* drop the exception entry; caller's ref stays */
				rt6_remove_exception_rt(rt);
				dst = NULL;
			}
			rcu_read_unlock();
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

/* dst_ops->link_failure handler: sends an ICMPv6 "address unreachable"
 * back to the sender, then invalidates the route that failed — cached
 * exceptions are removed outright; for default routes the fib node's
 * sernum is poisoned so cached dsts revalidate on next use.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		rcu_read_lock();
		if (rt->rt6i_flags & RTF_CACHE) {
			rt6_remove_exception_rt(rt);
		} else {
			struct fib6_info *from;
			struct fib6_node *fn;

			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					/* force cookie mismatch on revalidation */
					fn->fn_sernum = -1;
			}
		}
		rcu_read_unlock();
	}
}

/* Arm (or re-arm) the expiry timer on @rt0 to @timeout jiffies from now
 * and mark it RTF_EXPIRES.  If it was not expiring yet, first seed
 * dst.expires from the parent fib6_info so the earlier of the two
 * deadlines wins inside dst_set_expires().
 */
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}

2270 2271 2272 2273
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

2274
	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2275 2276 2277 2278
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

2279 2280 2281
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
2282
		(rt->rt6i_flags & RTF_PCPU || rcu_access_pointer(rt->from));
2283 2284
}

/* Core PMTU update: apply a new path MTU to @dst.  Addresses for the
 * neighbour confirmation come from @iph, else the socket, else none.
 * The MTU is clamped to IPV6_MIN_MTU and ignored unless it shrinks the
 * path.  Updates are written into the route itself when possible,
 * otherwise a cached exception clone is created to carry them.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct fib6_info *from;
		struct rt6_info *nrt6;

		rcu_read_lock();
		from = rcu_dereference(rt6->from);
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			/* insertion can fail (e.g. duplicate); drop our clone */
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}

2330 2331 2332 2333 2334 2335
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}

/* Update the path MTU for the flow described by an ICMPv6 packet-too-big
 * (@skb->data points at the embedded IPv6 header).  Looks up the output
 * route for that flow and applies the new MTU to it.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		/* fall back to the reply mark when no explicit mark given */
		.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark),
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

/* Socket-scoped PMTU update: apply the new MTU for @sk's flow, then, if
 * the socket's cached dst has become invalid as a result, refresh the
 * datagram socket's route (under the socket lock, v4-mapped excluded).
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	int oif = sk->sk_bound_dev_if;
	struct dst_entry *dst;

	/* unbound sockets on an L3 slave use the master's ifindex */
	if (!oif && skb->dev)
		oif = l3mdev_master_ifindex(skb->dev);

	ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

/* Cache @dst on @sk via ip6_dst_store(), passing the destination (and,
 * with subtrees, the source) address only when it matches the socket's
 * own address — a mismatch means the dst must not be tied to that key.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}

/* Handle redirects */

/* Extended flow key for redirect lookups: the flowi6 plus the gateway
 * (the router that sent the redirect), so __ip6_route_redirect() can
 * match the advertising router.  Must start with the embedded flowi6 —
 * it is passed around as a plain struct flowi6 pointer and cast back.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

/* Lookup used when processing an ICMPv6 redirect: find the route whose
 * nexthop is the router that sent the redirect (carried in the
 * ip6rd_flowi wrapper around @fl6).  Exception-table entries are also
 * consulted since a previous redirect may have changed the gateway.
 * Returns a held rt6_info; falls back to the null entry on no match.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *ret = NULL, *rt_cache;
	struct fib6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->fib6_flags & RTF_REJECT)
			break;
		if (!rt->fib6_nh.fib_nh_has_gw)
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.fib_nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.fib_nh_gw6)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				ret = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->fib6_flags & RTF_REJECT) {
		ret = net->ipv6.ip6_null_entry;
		goto out;
	}

	/* no match at this node: retry the less-specific prefix */
	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	if (ret)
		ip6_hold_safe(net, &ret);
	else
		/* materialize an rt6_info from the fib entry under RCU */
		ret = ip6_create_rt_rcu(rt);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return ret;
};

static struct dst_entry *ip6_route_redirect(struct net *net,
D
David Ahern 已提交
2483 2484 2485
					    const struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    const struct in6_addr *gateway)
2486 2487 2488 2489 2490 2491 2492
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

D
David Ahern 已提交
2493
	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2494 2495 2496
				flags, __ip6_route_redirect);
}

/* Process an ICMPv6 redirect for the flow embedded in @skb->data
 * (the offending packet's IPv6 header): look up the matching route and
 * let rt6_do_redirect() apply the new gateway.
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.flowi6_mark = mark,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_uid = uid,
	};

	/* the redirecting router is the outer header's source address */
	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

/* Variant of ip6_redirect() for redirect messages whose payload does not
 * carry the original packet header: the flow is reconstructed from the
 * redirect message itself (msg->dest) and the outer IPv6 header.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6 = {
		.flowi6_iif = LOOPBACK_IFINDEX,
		.flowi6_oif = oif,
		.daddr = msg->dest,
		/* we were the original sender; outer daddr is our address */
		.saddr = iph->daddr,
		.flowi6_uid = sock_net_uid(net, NULL),
	};

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

/* Socket convenience wrapper: process a redirect using the socket's
 * bound device, mark and uid.
 */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

2543
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
L
Linus Torvalds 已提交
2544
{
2545 2546 2547 2548
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

L
Linus Torvalds 已提交
2549 2550
	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

2551 2552
	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
L
Linus Torvalds 已提交
2553 2554

	/*
2555 2556 2557
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
L
Linus Torvalds 已提交
2558 2559 2560 2561 2562 2563 2564
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}

/* dst_ops->mtu handler: use the route's RTAX_MTU metric when set,
 * otherwise the egress device's IPv6 MTU (min IPV6_MIN_MTU), capped at
 * IP6_MAX_MTU, minus any lwtunnel encapsulation headroom.
 */
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	struct inet6_dev *idev;
	unsigned int mtu;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct in6_addr *src_key;
	struct inet6_dev *idev;
	u32 mtu = 0;

	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	/* exception entries are keyed by source only for subtree routes */
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	if (likely(!mtu)) {
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}

/* Allocate a standalone (uncached) host dst for sending an ICMPv6
 * message via @dev to fl6->daddr, then pass it through xfrm_lookup().
 * Returns the resulting dst or an ERR_PTR on failure.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	/* hop limit 0 metric means "use the device/ndisc default" */
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}

/* dst_ops->gc handler: garbage-collect IPv6 dst entries.  Skips the run
 * when within the min GC interval and under the size cap; otherwise
 * runs fib6 GC with an adaptive expiry that decays by the elasticity
 * sysctl.  Returns nonzero if the table is still over rt_max_size.
 */
static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	/* each unproductive pass GCs more aggressively next time */
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}

/* Look up the route to @gw_addr in a specific table @tbid (used when
 * validating a new route's nexthop).  Returns a held rt6_info, or NULL
 * if the table does not exist or only the null entry matched.
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	/* nexthop validation must not depend on current link state */
	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}

/* Validate an RTNH_F_ONLINK nexthop: the gateway must not resolve (in
 * the device's table) to a local/anycast/reject route or to a different
 * device.  The default route is ignored as a match.  Returns 0 or
 * -EINVAL with an extack message.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct fib6_info *from;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		rcu_read_lock();
		from = rcu_dereference(grt->from);
		if (!grt->dst.error &&
		    /* ignore match if it is the default route */
		    from && !ipv6_addr_any(&from->fib6_dst.addr) &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}
		rcu_read_unlock();

		ip6_rt_put(grt);
	}

	return err;
}

/* Resolve and validate a (non-onlink) gateway nexthop.  Tries the
 * configured table first, then a full lookup.  The gateway must resolve
 * to a connected (non-gateway) route; when no device was given, the
 * resolved device/idev are returned in *_dev/*idev with references held.
 * Returns 0 on success, -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			/* scoped-table match must be connected and on @dev */
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		/* adopt the resolved device; take refs for the caller */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}

/* Validate the gateway of a new route and resolve its egress device.
 * Rejects local gateway addresses and non-link-local gateways that are
 * not unicast/v4-mapped; dispatches to the onlink or regular nexthop
 * check.  On success *_dev/*idev identify the egress device.
 * Returns 0 or a negative errno with an extack message.
 */
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}

2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908
static bool fib6_is_reject(u32 flags, struct net_device *dev, int addr_type)
{
	if ((flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(flags & RTF_LOCAL)))
		return true;

	return false;
}

/* Initialize a fib6_nh from a route config: resolve and validate the
 * egress device and gateway, set nexthop flags (onlink/linkdown) and
 * set up lwtunnel encap state.  On success the nexthop holds a device
 * reference; on failure all temporary references are dropped.
 * Returns 0 or a negative errno with an extack message.
 */
int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
		 struct fib6_config *cfg, gfp_t gfp_flags,
		 struct netlink_ext_ack *extack)
{
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	int addr_type;
	int err;

	fib6_nh->fib_nh_family = AF_INET6;

	err = -ENODEV;
	if (cfg->fc_ifindex) {
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}

		fib6_nh->fib_nh_flags |= RTNH_F_ONLINK;
	}

	fib6_nh->fib_nh_weight = 1;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, dev, addr_type)) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		goto set_dev;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
		fib6_nh->fib_nh_has_gw = 1;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP) && !cfg->fc_ignore_dev_down) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!(cfg->fc_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		fib6_nh->fib_nh_flags |= RTNH_F_LINKDOWN;

	/* sets up lwtunnel encapsulation state, if any */
	err = fib_nh_common_init(&fib6_nh->nh_common, cfg->fc_encap,
				 cfg->fc_encap_type, cfg, gfp_flags, extack);
	if (err)
		goto out;
set_dev:
	fib6_nh->fib_nh_dev = dev;
	fib6_nh->fib_nh_oif = dev->ifindex;
	err = 0;
out:
	if (idev)
		in6_dev_put(idev);

	if (err) {
		lwtstate_put(fib6_nh->fib_nh_lws);
		fib6_nh->fib_nh_lws = NULL;
		if (dev)
			dev_put(dev);
	}

	return err;
}

/* Release the resources held by a fib6_nh (device reference, lwtunnel
 * state) via the generic nexthop teardown.  Counterpart of fib6_nh_init().
 */
void fib6_nh_release(struct fib6_nh *fib6_nh)
{
	fib_nh_common_release(&fib6_nh->nh_common);
}

3017
/* Validate a fib6_config and build (but do not insert) the corresponding
 * fib6_info.  Returns the new route with a reference held, or ERR_PTR() on
 * failure; all error paths funnel through "out" which drops the partially
 * built route.
 */
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
					      gfp_t gfp_flags,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;
	int err = -EINVAL;
	int addr_type;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	/* IPv6 prefix lengths are at most 128 bits */
	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif

	err = -ENOBUFS;
	/* Without NLM_F_CREATE only look up an existing table; warn (but
	 * still create one) if it does not exist yet.
	 */
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
		goto out;

	rt->fib6_metrics = ip_fib_metrics_init(net, cfg->fc_mx, cfg->fc_mx_len,
					       extack);
	if (IS_ERR(rt->fib6_metrics)) {
		err = PTR_ERR(rt->fib6_metrics);
		/* Do not leave garbage there. */
		rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
		goto out;
	}

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->fib6_protocol = cfg->fc_protocol;

	rt->fib6_table = table;
	rt->fib6_metric = cfg->fc_metric;
	rt->fib6_type = cfg->fc_type;
	/* RTF_GATEWAY is tracked via the nexthop, not the route flags */
	rt->fib6_flags = cfg->fc_flags & ~RTF_GATEWAY;

	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
		rt->dst_host = true;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
#endif
	err = fib6_nh_init(net, &rt->fib6_nh, cfg, gfp_flags, extack);
	if (err)
		goto out;

	/* We cannot add true routes via loopback here,
	 * they would result in kernel looping; promote them to reject routes
	 */
	addr_type = ipv6_addr_type(&cfg->fc_dst);
	if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh.fib_nh_dev, addr_type))
		rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		struct net_device *dev = fib6_info_nh_dev(rt);

		/* the preferred source must be a local address on the
		 * nexthop device
		 */
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
	} else
		rt->fib6_prefsrc.plen = 0;

	return rt;
out:
	fib6_info_release(rt);
	return ERR_PTR(err);
}

3146
/* Create a route from @cfg and insert it into the FIB.  The local reference
 * from ip6_route_info_create() is always dropped; on success the tree holds
 * its own reference.  Returns 0 or a negative errno.
 */
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
		  struct netlink_ext_ack *extack)
{
	struct fib6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, gfp_flags, extack);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
	fib6_info_release(rt);

	return err;
}

3162
/* Delete a single fib6_info from its table under tb6_lock.  Consumes the
 * caller's reference to @rt on all paths.  The null entry can never be
 * deleted (-ENOENT).
 */
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	fib6_info_release(rt);
	return err;
}

3183
/* Public single-route delete helper; wraps __ip6_del_rt() with a minimal
 * nl_info carrying only the namespace.  Consumes the caller's ref to @rt.
 */
int ip6_del_rt(struct net *net, struct fib6_info *rt)
{
	struct nl_info info = { .nl_net = net };

	return __ip6_del_rt(rt, &info);
}

3190
/* Delete a multipath route: @rt plus, when fc_delete_all_nh is set, all of
 * its ECMP siblings.  Tries to emit one combined RTM_DELROUTE notification
 * for every hop (suppressing the per-hop notifications via skip_notify);
 * falls back to default notification if the skb cannot be built.  Consumes
 * the caller's reference to @rt.
 */
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
		struct fib6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		/* _safe variant: fib6_del() unlinks each sibling */
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings,
					 fib6_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	fib6_info_release(rt);

	if (skb) {
		/* combined notification, sent after the lock is dropped */
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}

3242 3243 3244 3245 3246 3247 3248 3249 3250 3251
/* Remove a cached (exception) route if it matches the ifindex/gateway
 * filters from @cfg.  Returns -ESRCH when the filters do not match,
 * otherwise the result of removing the exception entry.
 */
static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
		return -ESRCH;

	if (cfg->fc_flags & RTF_GATEWAY &&
	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
		return -ESRCH;

	return rt6_remove_exception_rt(rt);
}

3258 3259
/* Delete the route matching @cfg (dst/src prefix plus optional ifindex,
 * gateway, metric and protocol filters).  RTF_CACHE requests target the
 * exception table instead of the main tree.  Walks the matching fib6_node
 * under RCU; drops the RCU lock before calling the delete helpers, which
 * take the table lock themselves.  Returns 0 or a negative errno (-ESRCH
 * when nothing matched).
 */
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_cache;
	struct fib6_table *table;
	struct fib6_info *rt;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	/* exact_match is false for RTF_CACHE so that a covering node
	 * holding the exception can be found
	 */
	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			struct fib6_nh *nh;

			if (cfg->fc_flags & RTF_CACHE) {
				int rc;

				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					/* -ESRCH means "keep scanning" */
					if (rc != -ESRCH) {
						rcu_read_unlock();
						return rc;
					}
				}
				continue;
			}

			nh = &rt->fib6_nh;
			if (cfg->fc_ifindex &&
			    (!nh->fib_nh_dev ||
			     nh->fib_nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &nh->fib_nh_gw6))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
				continue;
			/* route may be going away concurrently */
			if (!fib6_info_hold_safe(rt))
				continue;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}

3327
/* Handle an ICMPv6 Redirect for @dst: validate the redirect message and its
 * ND options, update the neighbour cache for the new target, and install a
 * cached route (exception entry) pointing at the new next hop.  Silently
 * returns on any validation failure (with a ratelimited debug message).
 */
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct fib6_info *from;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	/* dest == target means the destination itself is on-link;
	 * otherwise the target must be a link-local unicast router.
	 */
	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	/* routers forwarding traffic must not accept redirects */
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	rcu_read_lock();
	from = rcu_dereference(rt->from);
	/* This fib6_info_hold() is safe here because we hold reference to rt
	 * and rt already holds reference to fib6_info.
	 */
	fib6_info_hold(from);
	rcu_read_unlock();

	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * takes care of it
	 */
	if (rt6_insert_exception(nrt, from)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	fib6_info_release(from);
	neigh_release(neigh);
}

3454
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Look up an RA route-information route (RTF_ROUTEINFO) for the given
 * prefix/gateway/device in the l3mdev (or RT6_TABLE_INFO) table.  Returns
 * the route with a reference held, or NULL.
 */
static struct fib6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct fib6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.fib_nh_dev->ifindex != ifindex)
			continue;
		if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
		    !rt->fib6_nh.fib_nh_has_gw)
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.fib_nh_gw6, gwaddr))
			continue;
		/* route may be going away concurrently */
		if (!fib6_info_hold_safe(rt))
			continue;
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}

3492
static struct fib6_info *rt6_add_route_info(struct net *net,
3493
					   const struct in6_addr *prefix, int prefixlen,
3494 3495
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
3496
					   unsigned int pref)
3497
{
3498
	struct fib6_config cfg = {
3499
		.fc_metric	= IP6_RT_PRIO_USER,
3500
		.fc_ifindex	= dev->ifindex,
3501 3502 3503
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
3504
		.fc_protocol = RTPROT_RA,
3505
		.fc_type = RTN_UNICAST,
3506
		.fc_nlinfo.portid = 0,
3507 3508
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
3509 3510
	};

3511
	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
A
Alexey Dobriyan 已提交
3512 3513
	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;
3514

3515 3516
	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
3517
		cfg.fc_flags |= RTF_DEFAULT;
3518

3519
	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3520

3521
	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3522 3523 3524
}
#endif

3525
/* Find the RA-learned default router entry (RTF_ADDRCONF|RTF_DEFAULT) whose
 * gateway is @addr on @dev.  Returns the route with a reference held, or
 * NULL if absent or already being freed.
 */
struct fib6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct fib6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct fib6_nh *nh = &rt->fib6_nh;

		if (dev == nh->fib_nh_dev &&
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&nh->fib_nh_gw6, addr))
			break;
	}
	/* rt is the loop cursor; NULL it if the ref cannot be taken */
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
	rcu_read_unlock();
	return rt;
}

3552
/* Install an RA-learned default route via @gwaddr on @dev, mark the table as
 * holding a default router, and return the (referenced) resulting route via
 * rt6_get_dflt_router(), or NULL if the add failed.
 */
struct fib6_info *rt6_add_dflt_router(struct net *net,
				     const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
		struct fib6_table *table;

		/* flag the table so purging default routers can skip
		 * tables that never held one
		 */
		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}

3583 3584
/* Delete all RA-learned (RTF_DEFAULT/RTF_ADDRCONF) routes in @table, except
 * on interfaces with accept_ra == 2 (accept RA even when forwarding).
 * Restarts the RCU walk after each deletion since ip6_del_rt() must run
 * outside the RCU section.
 */
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct fib6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
			rcu_read_unlock();
			/* consumes the reference taken above */
			ip6_del_rt(net, rt);
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}

/* Purge RA-learned default routers from every FIB table in @net that is
 * flagged as holding one (RT6_TABLE_HAS_DFLT_ROUTER).
 */
void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(net, table);
		}
	}

	rcu_read_unlock();
}

3626 3627
/* Translate a legacy ioctl in6_rtmsg into a fib6_config.  The compound
 * literal assignment zero-initializes every field not listed.
 */
static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
				 struct fib6_config *cfg)
{
	*cfg = (struct fib6_config){
		.fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
			 : RT6_TABLE_MAIN,
		.fc_ifindex = rtmsg->rtmsg_ifindex,
		.fc_metric = rtmsg->rtmsg_metric ? : IP6_RT_PRIO_USER,
		.fc_expires = rtmsg->rtmsg_info,
		.fc_dst_len = rtmsg->rtmsg_dst_len,
		.fc_src_len = rtmsg->rtmsg_src_len,
		.fc_flags = rtmsg->rtmsg_flags,
		.fc_type = rtmsg->rtmsg_type,

		.fc_nlinfo.nl_net = net,

		.fc_dst = rtmsg->rtmsg_dst,
		.fc_src = rtmsg->rtmsg_src,
		.fc_gateway = rtmsg->rtmsg_gateway,
	};
}

3649
/* Legacy SIOCADDRT/SIOCDELRT ioctl entry point.  Requires CAP_NET_ADMIN in
 * the owning user namespace; copies the in6_rtmsg from userspace, converts
 * it to a fib6_config and performs the add/delete under the RTNL lock.
 * Returns 0 or a negative errno.
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg, NULL);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}

/*
 *	Drop the packet on the floor
 */

/* Common handler for blackhole/prohibit routes: bump the appropriate SNMP
 * counter, send an ICMPv6 Destination Unreachable with @code (unless the
 * destination was the unspecified address), and free the skb.  Always
 * returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* unspecified destination: count as address error,
			 * no ICMP error counter bump via the noroutes MIB
			 */
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}

3714 3715
/* dst input handler for blackhole routes (inbound direction) */
static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}

E
Eric W. Biederman 已提交
3719
/* dst output handler for blackhole routes (outbound direction) */
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}

3725 3726
/* dst input handler for prohibit routes (inbound direction) */
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

E
Eric W. Biederman 已提交
3730
/* dst output handler for prohibit routes (outbound direction) */
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}

L
Linus Torvalds 已提交
3736 3737 3738 3739
/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

/* Build (without inserting) the /128 local or anycast route for @addr on
 * @idev's device.  Returns the new fib6_info or ERR_PTR() from
 * ip6_route_info_create().
 */
struct fib6_info *addrconf_f6i_alloc(struct net *net,
				     struct inet6_dev *idev,
				     const struct in6_addr *addr,
				     bool anycast, gfp_t gfp_flags)
{
	struct fib6_config cfg = {
		.fc_table = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL,
		.fc_ifindex = idev->dev->ifindex,
		.fc_flags = RTF_UP | RTF_ADDRCONF | RTF_NONEXTHOP,
		.fc_dst = *addr,
		.fc_dst_len = 128,
		.fc_protocol = RTPROT_KERNEL,
		.fc_nlinfo.nl_net = net,
		/* local routes must survive the device going down */
		.fc_ignore_dev_down = true,
	};

	if (anycast) {
		cfg.fc_type = RTN_ANYCAST;
		cfg.fc_flags |= RTF_ANYCAST;
	} else {
		cfg.fc_type = RTN_LOCAL;
		cfg.fc_flags |= RTF_LOCAL;
	}

	return ip6_route_info_create(&cfg, gfp_flags, NULL);
}

3767 3768 3769 3770 3771 3772 3773
/* remove deleted ip from prefsrc entries */
/* argument bundle for the fib6_remove_prefsrc() tree-walk callback */
struct arg_dev_net_ip {
	struct net_device *dev;	/* device being cleaned; NULL matches all */
	struct net *net;
	struct in6_addr *addr;	/* address being removed */
};

3774
/* fib6_clean_all() callback: clear the preferred-source entry of any route
 * whose prefsrc equals the removed address (on the matching device, or any
 * device when arg->dev is NULL).  Always returns 0 (keep walking).
 */
static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

	if (((void *)rt->fib6_nh.fib_nh_dev == dev || !dev) &&
	    rt != net->ipv6.fib6_null_entry &&
	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
		spin_lock_bh(&rt6_exception_lock);
		/* remove prefsrc entry */
		rt->fib6_prefsrc.plen = 0;
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}

/* Clear the prefsrc of all routes that referenced the address @ifp, which
 * is being removed from its interface.
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}

3802
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT)

/* Remove routers and update dst entries when gateway turn into host. */
/* fib6_clean_all() callback: returns -1 (delete) for RA default-router
 * routes whose gateway matches @arg; for everything else also scrubs
 * matching cached routes from the exception table and returns 0.
 */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    rt->fib6_nh.fib_nh_has_gw &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.fib_nh_gw6)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}

/* Walk all tables removing router entries for @gateway, which has become a
 * plain host (see fib6_clean_tohost()).
 */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}

3829 3830
/* argument bundle for the fib6_ifup()/fib6_ifdown() tree-walk callbacks;
 * the union member used depends on which callback receives it
 */
struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;	/* used by fib6_ifup() */
		unsigned long event;	/* used by fib6_ifdown() */
	};
};

3837
/* Find the first route in @rt's fib6_node leaf list that belongs to the
 * same ECMP group (same metric and ECMP-qualified).  Caller must hold the
 * table's tb6_lock (enforced via lockdep_is_held()).  Returns NULL if none.
 */
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}

3857
static bool rt6_is_dead(const struct fib6_info *rt)
3858
{
D
David Ahern 已提交
3859 3860 3861
	if (rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD ||
	    (rt->fib6_nh.fib_nh_flags & RTNH_F_LINKDOWN &&
	     ip6_ignore_linkdown(rt->fib6_nh.fib_nh_dev)))
3862 3863 3864 3865 3866
		return true;

	return false;
}

3867
/* Sum the nexthop weights of @rt and all its ECMP siblings, skipping dead
 * nexthops (see rt6_is_dead()).
 */
static int rt6_multipath_total_weight(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	int total = 0;

	if (!rt6_is_dead(rt))
		total += rt->fib6_nh.fib_nh_weight;

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
		if (!rt6_is_dead(iter))
			total += iter->fib6_nh.fib_nh_weight;
	}

	return total;
}

3883
/* Assign @rt's hash upper bound from the running cumulative weight:
 * bound = round(2^31 * cumulative/total) - 1, or -1 for dead nexthops
 * (never selected).  @weight is updated in place for the next sibling.
 */
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.fib_nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.fib_nh_upper_bound, upper_bound);
}

3895
/* Recompute the hash upper bounds for @rt and each of its ECMP siblings,
 * accumulating weight in list order so the bounds partition [0, 2^31).
 */
static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
	struct fib6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}

3906
/* Rebalance the ECMP hash bounds of the multipath group containing @rt
 * after a nexthop changed state.  No-op for single-path routes or groups
 * already marked for flushing.
 */
void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}

3930
/* fib6_clean_all() callback for device-up events: clear the requested
 * nexthop flags on routes using arg->dev, bump the tree sernum and
 * rebalance the multipath group.  Always returns 0 (keep walking).
 */
static int fib6_ifup(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	struct net *net = dev_net(arg->dev);

	if (rt != net->ipv6.fib6_null_entry &&
	    rt->fib6_nh.fib_nh_dev == arg->dev) {
		rt->fib6_nh.fib_nh_flags &= ~arg->nh_flags;
		fib6_update_sernum_upto_root(net, rt);
		rt6_multipath_rebalance(rt);
	}

	return 0;
}

/* Clear @nh_flags on all routes through @dev after it came (back) up.
 * When clearing RTNH_F_DEAD and the carrier is up, RTNH_F_LINKDOWN is
 * cleared as well.
 */
void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}

3960
/* True if @rt or any of its ECMP siblings has its nexthop on @dev. */
static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
				   const struct net_device *dev)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.fib_nh_dev == dev)
		return true;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.fib_nh_dev == dev)
			return true;

	return false;
}

3974
/* Mark @rt and all its ECMP siblings for flushing so the whole multipath
 * route is removed as a unit.
 */
static void rt6_multipath_flush(struct fib6_info *rt)
{
	struct fib6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		iter->should_flush = 1;
}

3983
/* Count how many nexthops of the multipath group (@rt plus siblings) are on
 * @down_dev or already flagged RTNH_F_DEAD.
 */
static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
					     const struct net_device *down_dev)
{
	struct fib6_info *iter;
	unsigned int dead = 0;

	if (rt->fib6_nh.fib_nh_dev == down_dev ||
	    rt->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
		dead++;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.fib_nh_dev == down_dev ||
		    iter->fib6_nh.fib_nh_flags & RTNH_F_DEAD)
			dead++;

	return dead;
}

4000
/* Set @nh_flags on every nexthop of the multipath group (@rt plus siblings)
 * whose device is @dev.
 */
static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
				       const struct net_device *dev,
				       unsigned int nh_flags)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.fib_nh_dev == dev)
		rt->fib6_nh.fib_nh_flags |= nh_flags;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.fib_nh_dev == dev)
			iter->fib6_nh.fib_nh_flags |= nh_flags;
}

4013
/* called with write lock held for table with rt */
/* fib6_clean_all() callback for device-down events.  Return values drive
 * the walker: 0 = keep route, -1 = delete route, -2 = delete route but
 * suppress the notification (multipath member handled via flags).
 */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		/* device is going away: delete all its routes */
		return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.fib_nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			/* all nexthops dead: flush the whole group */
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		if (rt->fib6_nh.fib_nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.fib_nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}

4057
/* Propagate a device down/unregister @event to all routes through @dev,
 * optionally skipping netlink notifications per the
 * skip_notify_on_dev_down sysctl.
 */
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};
	struct net *net = dev_net(dev);

	if (net->ipv6.sysctl.skip_notify_on_dev_down)
		fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
	else
		fib6_clean_all(net, fib6_ifdown, &arg);
}

void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
L
Linus Torvalds 已提交
4078 4079
}

4080
struct rt6_mtu_change_arg {
L
Linus Torvalds 已提交
4081
	struct net_device *dev;
4082
	unsigned int mtu;
L
Linus Torvalds 已提交
4083 4084
};

4085
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
L
Linus Torvalds 已提交
4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
4097
	if (!idev)
L
Linus Torvalds 已提交
4098 4099 4100 4101 4102 4103 4104
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
D
David Ahern 已提交
4105
	if (rt->fib6_nh.fib_nh_dev == arg->dev &&
4106 4107 4108 4109 4110 4111 4112
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

4113
		spin_lock_bh(&rt6_exception_lock);
4114
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4115
		spin_unlock_bh(&rt6_exception_lock);
4116
	}
L
Linus Torvalds 已提交
4117 4118 4119
	return 0;
}

4120
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
L
Linus Torvalds 已提交
4121
{
T
Thomas Graf 已提交
4122 4123 4124 4125
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};
L
Linus Torvalds 已提交
4126

4127
	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
L
Linus Torvalds 已提交
4128 4129
}

4130
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4131
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
4132
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
4133
	[RTA_OIF]               = { .type = NLA_U32 },
4134
	[RTA_IIF]		= { .type = NLA_U32 },
4135 4136
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
4137
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
4138
	[RTA_PREF]              = { .type = NLA_U8 },
4139 4140
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
4141
	[RTA_EXPIRES]		= { .type = NLA_U32 },
4142
	[RTA_UID]		= { .type = NLA_U32 },
4143
	[RTA_MARK]		= { .type = NLA_U32 },
4144
	[RTA_TABLE]		= { .type = NLA_U32 },
4145 4146 4147
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
4148 4149 4150
};

static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4151 4152
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
4153
{
4154 4155
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
4156
	unsigned int pref;
4157
	int err;
L
Linus Torvalds 已提交
4158

4159
	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
D
David Ahern 已提交
4160
			  extack);
4161 4162
	if (err < 0)
		goto errout;
L
Linus Torvalds 已提交
4163

4164 4165 4166
	err = -EINVAL;
	rtm = nlmsg_data(nlh);

4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178
	*cfg = (struct fib6_config){
		.fc_table = rtm->rtm_table,
		.fc_dst_len = rtm->rtm_dst_len,
		.fc_src_len = rtm->rtm_src_len,
		.fc_flags = RTF_UP,
		.fc_protocol = rtm->rtm_protocol,
		.fc_type = rtm->rtm_type,

		.fc_nlinfo.portid = NETLINK_CB(skb).portid,
		.fc_nlinfo.nlh = nlh,
		.fc_nlinfo.nl_net = sock_net(skb->sk),
	};
4179

4180 4181
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
4182 4183
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
4184 4185
		cfg->fc_flags |= RTF_REJECT;

4186 4187 4188
	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

4189 4190 4191
	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

4192 4193
	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

4194
	if (tb[RTA_GATEWAY]) {
4195
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4196
		cfg->fc_flags |= RTF_GATEWAY;
L
Linus Torvalds 已提交
4197
	}
4198 4199 4200 4201
	if (tb[RTA_VIA]) {
		NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
		goto errout;
	}
4202 4203 4204 4205 4206 4207 4208 4209

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
L
Linus Torvalds 已提交
4210
	}
4211 4212 4213 4214 4215 4216 4217 4218

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
L
Linus Torvalds 已提交
4219
	}
4220

4221
	if (tb[RTA_PREFSRC])
4222
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4223

4224 4225 4226 4227 4228 4229 4230 4231 4232
	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
L
Linus Torvalds 已提交
4233
	}
4234 4235 4236 4237

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

4238 4239 4240
	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4241 4242

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4243
						     cfg->fc_mp_len, extack);
4244 4245
		if (err < 0)
			goto errout;
4246 4247
	}

4248 4249 4250 4251 4252 4253 4254 4255
	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

4256 4257 4258
	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

4259
	if (tb[RTA_ENCAP_TYPE]) {
4260 4261
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

4262
		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4263 4264 4265 4266
		if (err < 0)
			goto errout;
	}

4267 4268 4269 4270 4271 4272 4273 4274 4275
	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

4276 4277 4278
	err = 0;
errout:
	return err;
L
Linus Torvalds 已提交
4279 4280
}

4281
struct rt6_nh {
4282
	struct fib6_info *fib6_info;
4283 4284 4285 4286
	struct fib6_config r_cfg;
	struct list_head next;
};

4287 4288
static int ip6_route_info_append(struct net *net,
				 struct list_head *rt6_nh_list,
4289 4290
				 struct fib6_info *rt,
				 struct fib6_config *r_cfg)
4291 4292 4293 4294 4295
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
4296 4297
		/* check if fib6_info already exists */
		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4298 4299 4300 4301 4302 4303
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
4304
	nh->fib6_info = rt;
4305 4306 4307 4308 4309 4310
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}

4311 4312
static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
4313 4314 4315 4316 4317 4318 4319 4320 4321
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
4322 4323
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
4324
				      struct fib6_info,
4325
				      fib6_siblings);
4326 4327 4328 4329 4330 4331
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}

4332 4333
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
4334
{
4335
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4336
	struct nl_info *info = &cfg->fc_nlinfo;
4337 4338
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
4339
	struct fib6_info *rt;
4340 4341
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
4342
	__u16 nlflags;
4343 4344
	int remaining;
	int attrlen;
4345 4346 4347 4348 4349
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);
4350

4351 4352 4353 4354
	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

4355
	remaining = cfg->fc_mp_len;
4356 4357
	rtnh = (struct rtnexthop *)cfg->fc_mp;

4358
	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
4359
	 * fib6_info structs per nexthop
4360
	 */
4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
4372
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
4373 4374
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
4375 4376 4377 4378
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
4379
		}
4380

4381
		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4382
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4383 4384 4385
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
4386
			goto cleanup;
4387
		}
4388 4389 4390 4391 4392 4393 4394
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}
4395

D
David Ahern 已提交
4396
		rt->fib6_nh.fib_nh_weight = rtnh->rtnh_hops + 1;
4397

4398 4399
		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
4400
		if (err) {
4401
			fib6_info_release(rt);
4402 4403 4404 4405 4406 4407
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

4408 4409 4410 4411 4412 4413
	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

4414 4415
	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
4416 4417
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);
4418

4419 4420 4421 4422 4423 4424 4425 4426
		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}
4427

4428 4429
		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
4430 4431
		if (err) {
			if (replace && nhn)
4432 4433
				NL_SET_ERR_MSG_MOD(extack,
						   "multipath route replace failed (check consistency of installed routes)");
4434 4435
			err_nh = nh;
			goto add_errout;
4436
		}
4437

4438
		/* Because each route is added like a single route we remove
4439 4440 4441 4442 4443
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
4444
		 */
4445 4446
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
4447 4448 4449
		nhn++;
	}

4450 4451
	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4452 4453 4454
	goto cleanup;

add_errout:
4455 4456 4457 4458 4459 4460 4461
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

4462 4463 4464 4465
	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
4466
		ip6_route_del(&nh->r_cfg, extack);
4467 4468 4469 4470
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4471 4472
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
4473 4474 4475 4476 4477 4478 4479
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}

4480 4481
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
4508
		err = ip6_route_del(&r_cfg, extack);
4509 4510 4511
		if (err)
			last_err = err;

4512 4513 4514 4515 4516 4517
		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

4518 4519
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
4520
{
4521 4522
	struct fib6_config cfg;
	int err;
L
Linus Torvalds 已提交
4523

4524
	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4525 4526 4527
	if (err < 0)
		return err;

4528
	if (cfg.fc_mp)
4529
		return ip6_route_multipath_del(&cfg, extack);
4530 4531
	else {
		cfg.fc_delete_all_nh = 1;
4532
		return ip6_route_del(&cfg, extack);
4533
	}
L
Linus Torvalds 已提交
4534 4535
}

4536 4537
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
4538
{
4539 4540
	struct fib6_config cfg;
	int err;
L
Linus Torvalds 已提交
4541

4542
	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4543 4544 4545
	if (err < 0)
		return err;

4546 4547 4548
	if (cfg.fc_metric == 0)
		cfg.fc_metric = IP6_RT_PRIO_USER;

4549
	if (cfg.fc_mp)
4550
		return ip6_route_multipath_add(&cfg, extack);
4551
	else
4552
		return ip6_route_add(&cfg, GFP_KERNEL, extack);
L
Linus Torvalds 已提交
4553 4554
}

4555
static size_t rt6_nlmsg_size(struct fib6_info *rt)
4556
{
4557 4558
	int nexthop_len = 0;

4559
	if (rt->fib6_nsiblings) {
4560 4561 4562
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
D
David Ahern 已提交
4563
			    + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws);
4564

4565
		nexthop_len *= rt->fib6_nsiblings;
4566 4567
	}

4568 4569 4570 4571 4572 4573 4574 4575 4576
	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
4577
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4578
	       + nla_total_size(sizeof(struct rta_cacheinfo))
4579
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4580
	       + nla_total_size(1) /* RTA_PREF */
D
David Ahern 已提交
4581
	       + lwtunnel_get_encap_size(rt->fib6_nh.fib_nh_lws)
4582 4583 4584
	       + nexthop_len;
}

4585
static int rt6_nexthop_info(struct sk_buff *skb, const struct fib6_nh *fib6_nh,
4586
			    unsigned int *flags, bool skip_oif)
4587
{
D
David Ahern 已提交
4588
	if (fib6_nh->fib_nh_flags & RTNH_F_DEAD)
4589 4590
		*flags |= RTNH_F_DEAD;

D
David Ahern 已提交
4591
	if (fib6_nh->fib_nh_flags & RTNH_F_LINKDOWN) {
4592
		*flags |= RTNH_F_LINKDOWN;
D
David Ahern 已提交
4593 4594

		rcu_read_lock();
D
David Ahern 已提交
4595
		if (ip6_ignore_linkdown(fib6_nh->fib_nh_dev))
4596
			*flags |= RTNH_F_DEAD;
D
David Ahern 已提交
4597
		rcu_read_unlock();
4598 4599
	}

4600
	if (fib6_nh->fib_nh_has_gw) {
D
David Ahern 已提交
4601
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &fib6_nh->fib_nh_gw6) < 0)
4602 4603 4604
			goto nla_put_failure;
	}

D
David Ahern 已提交
4605 4606
	*flags |= (fib6_nh->fib_nh_flags & RTNH_F_ONLINK);
	if (fib6_nh->fib_nh_flags & RTNH_F_OFFLOAD)
4607 4608
		*flags |= RTNH_F_OFFLOAD;

4609
	/* not needed for multipath encoding b/c it has a rtnexthop struct */
D
David Ahern 已提交
4610 4611
	if (!skip_oif && fib6_nh->fib_nh_dev &&
	    nla_put_u32(skb, RTA_OIF, fib6_nh->fib_nh_dev->ifindex))
4612 4613
		goto nla_put_failure;

D
David Ahern 已提交
4614 4615
	if (fib6_nh->fib_nh_lws &&
	    lwtunnel_fill_encap(skb, fib6_nh->fib_nh_lws) < 0)
4616 4617 4618 4619 4620 4621 4622 4623
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

4624
/* add multipath next hop */
4625
static int rt6_add_nexthop(struct sk_buff *skb, const struct fib6_nh *fib6_nh)
4626
{
D
David Ahern 已提交
4627
	const struct net_device *dev = fib6_nh->fib_nh_dev;
4628 4629 4630 4631 4632 4633 4634
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

D
David Ahern 已提交
4635
	rtnh->rtnh_hops = fib6_nh->fib_nh_weight - 1;
4636
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4637

4638
	if (rt6_nexthop_info(skb, fib6_nh, &flags, true) < 0)
4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
4650 4651
}

4652
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4653
			 struct fib6_info *rt, struct dst_entry *dst,
4654
			 struct in6_addr *dest, struct in6_addr *src,
4655
			 int iif, int type, u32 portid, u32 seq,
4656
			 unsigned int flags)
L
Linus Torvalds 已提交
4657
{
4658 4659 4660
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
4661
	struct nlmsghdr *nlh;
4662
	struct rtmsg *rtm;
4663
	long expires = 0;
L
Linus Torvalds 已提交
4664

4665
	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4666
	if (!nlh)
4667
		return -EMSGSIZE;
4668

4669 4670 4671 4672 4673 4674 4675 4676 4677 4678
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

4679
	rtm = nlmsg_data(nlh);
L
Linus Torvalds 已提交
4680
	rtm->rtm_family = AF_INET6;
4681 4682
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
L
Linus Torvalds 已提交
4683
	rtm->rtm_tos = 0;
4684 4685
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
T
Thomas Graf 已提交
4686
	else
4687
		table = RT6_TABLE_UNSPEC;
4688
	rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
D
David S. Miller 已提交
4689 4690
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
4691 4692

	rtm->rtm_type = rt->fib6_type;
L
Linus Torvalds 已提交
4693 4694
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4695
	rtm->rtm_protocol = rt->fib6_protocol;
L
Linus Torvalds 已提交
4696

4697
	if (rt6_flags & RTF_CACHE)
L
Linus Torvalds 已提交
4698 4699
		rtm->rtm_flags |= RTM_F_CLONED;

4700 4701
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
D
David S. Miller 已提交
4702
			goto nla_put_failure;
4703
		rtm->rtm_dst_len = 128;
L
Linus Torvalds 已提交
4704
	} else if (rtm->rtm_dst_len)
4705
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
D
David S. Miller 已提交
4706
			goto nla_put_failure;
L
Linus Torvalds 已提交
4707 4708
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
4709
		if (nla_put_in6_addr(skb, RTA_SRC, src))
D
David S. Miller 已提交
4710
			goto nla_put_failure;
4711
		rtm->rtm_src_len = 128;
D
David S. Miller 已提交
4712
	} else if (rtm->rtm_src_len &&
4713
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
D
David S. Miller 已提交
4714
		goto nla_put_failure;
L
Linus Torvalds 已提交
4715
#endif
4716 4717
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
4718
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4719 4720 4721 4722 4723 4724
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
4725 4726
		} else
#endif
D
David S. Miller 已提交
4727 4728
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
4729
	} else if (dest) {
L
Linus Torvalds 已提交
4730
		struct in6_addr saddr_buf;
4731
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4732
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
D
David S. Miller 已提交
4733
			goto nla_put_failure;
L
Linus Torvalds 已提交
4734
	}
4735

4736
	if (rt->fib6_prefsrc.plen) {
4737
		struct in6_addr saddr_buf;
4738
		saddr_buf = rt->fib6_prefsrc.addr;
4739
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
D
David S. Miller 已提交
4740
			goto nla_put_failure;
4741 4742
	}

4743 4744
	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4745 4746
		goto nla_put_failure;

4747
	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
D
David S. Miller 已提交
4748
		goto nla_put_failure;
4749

4750 4751 4752
	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
4753 4754 4755 4756 4757 4758 4759 4760
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
4761
		struct fib6_info *sibling, *next_sibling;
4762 4763 4764 4765 4766 4767
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

4768
		if (rt6_add_nexthop(skb, &rt->fib6_nh) < 0)
4769 4770 4771
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
4772
					 &rt->fib6_siblings, fib6_siblings) {
4773
			if (rt6_add_nexthop(skb, &sibling->fib6_nh) < 0)
4774 4775 4776 4777 4778
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
4779 4780
		if (rt6_nexthop_info(skb, &rt->fib6_nh, &rtm->rtm_flags,
				     false) < 0)
4781 4782 4783
			goto nla_put_failure;
	}

4784
	if (rt6_flags & RTF_EXPIRES) {
4785 4786 4787
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}
4788

4789
	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4790
		goto nla_put_failure;
4791

4792
	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4793 4794
		goto nla_put_failure;

4795

4796 4797
	nlmsg_end(skb, nlh);
	return 0;
4798 4799

nla_put_failure:
4800 4801
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
L
Linus Torvalds 已提交
4802 4803
}

4804 4805 4806
static bool fib6_info_uses_dev(const struct fib6_info *f6i,
			       const struct net_device *dev)
{
D
David Ahern 已提交
4807
	if (f6i->fib6_nh.fib_nh_dev == dev)
4808 4809 4810 4811 4812 4813 4814
		return true;

	if (f6i->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;

		list_for_each_entry_safe(sibling, next_sibling,
					 &f6i->fib6_siblings, fib6_siblings) {
D
David Ahern 已提交
4815
			if (sibling->fib6_nh.fib_nh_dev == dev)
4816 4817 4818 4819 4820 4821 4822
				return true;
		}
	}

	return false;
}

4823
int rt6_dump_route(struct fib6_info *rt, void *p_arg)
L
Linus Torvalds 已提交
4824 4825
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4826 4827
	struct fib_dump_filter *filter = &arg->filter;
	unsigned int flags = NLM_F_MULTI;
4828 4829
	struct net *net = arg->net;

D
David Ahern 已提交
4830
	if (rt == net->ipv6.fib6_null_entry)
4831
		return 0;
L
Linus Torvalds 已提交
4832

4833 4834 4835 4836 4837 4838 4839 4840 4841
	if ((filter->flags & RTM_F_PREFIX) &&
	    !(rt->fib6_flags & RTF_PREFIX_RT)) {
		/* success since this is not a prefix route */
		return 1;
	}
	if (filter->filter_set) {
		if ((filter->rt_type && rt->fib6_type != filter->rt_type) ||
		    (filter->dev && !fib6_info_uses_dev(rt, filter->dev)) ||
		    (filter->protocol && rt->fib6_protocol != filter->protocol)) {
4842 4843
			return 1;
		}
4844
		flags |= NLM_F_DUMP_FILTERED;
4845
	}
L
Linus Torvalds 已提交
4846

4847 4848
	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4849
			     arg->cb->nlh->nlmsg_seq, flags);
L
Linus Torvalds 已提交
4850 4851
}

4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918
static int inet6_rtm_valid_getroute_req(struct sk_buff *skb,
					const struct nlmsghdr *nlh,
					struct nlattr **tb,
					struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid header for get route request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
				   rtm_ipv6_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) ||
	    rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope ||
	    rtm->rtm_type) {
		NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for get route request");
		return -EINVAL;
	}
	if (rtm->rtm_flags & ~RTM_F_FIB_MATCH) {
		NL_SET_ERR_MSG_MOD(extack,
				   "Invalid flags for get route request");
		return -EINVAL;
	}

	err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
				 rtm_ipv6_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_SRC:
		case RTA_DST:
		case RTA_IIF:
		case RTA_OIF:
		case RTA_MARK:
		case RTA_UID:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_IP_PROTO:
			break;
		default:
			NL_SET_ERR_MSG_MOD(extack, "Unsupported attribute in get route request");
			return -EINVAL;
		}
	}

	return 0;
}

4919 4920
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
4921
{
4922
	struct net *net = sock_net(in_skb->sk);
4923
	struct nlattr *tb[RTA_MAX+1];
4924
	int err, iif = 0, oif = 0;
4925
	struct fib6_info *from;
4926
	struct dst_entry *dst;
4927
	struct rt6_info *rt;
L
Linus Torvalds 已提交
4928
	struct sk_buff *skb;
4929
	struct rtmsg *rtm;
4930
	struct flowi6 fl6 = {};
4931
	bool fibmatch;
L
Linus Torvalds 已提交
4932

4933
	err = inet6_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
4934 4935
	if (err < 0)
		goto errout;
L
Linus Torvalds 已提交
4936

4937
	err = -EINVAL;
4938 4939
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4940
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
L
Linus Torvalds 已提交
4941

4942 4943 4944 4945
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

A
Alexey Dobriyan 已提交
4946
		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4947 4948 4949 4950 4951 4952
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

A
Alexey Dobriyan 已提交
4953
		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4954 4955 4956 4957 4958 4959
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
4960
		oif = nla_get_u32(tb[RTA_OIF]);
L
Linus Torvalds 已提交
4961

4962 4963 4964
	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

4965 4966 4967 4968 4969 4970
	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

4971 4972 4973 4974 4975 4976 4977 4978
	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4979 4980
						  &fl6.flowi6_proto, AF_INET6,
						  extack);
4981 4982 4983 4984
		if (err)
			goto errout;
	}

L
Linus Torvalds 已提交
4985 4986
	if (iif) {
		struct net_device *dev;
4987 4988
		int flags = 0;

4989 4990 4991
		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
L
Linus Torvalds 已提交
4992
		if (!dev) {
4993
			rcu_read_unlock();
L
Linus Torvalds 已提交
4994
			err = -ENODEV;
4995
			goto errout;
L
Linus Torvalds 已提交
4996
		}
4997 4998 4999 5000 5001 5002

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

D
David Ahern 已提交
5003
		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5004 5005

		rcu_read_unlock();
5006 5007 5008
	} else {
		fl6.flowi6_oif = oif;

5009
		dst = ip6_route_output(net, NULL, &fl6);
5010 5011 5012 5013 5014 5015 5016 5017
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
L
Linus Torvalds 已提交
5018 5019
	}

5020 5021 5022 5023 5024 5025
	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

5026
	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5027
	if (!skb) {
A
Amerigo Wang 已提交
5028
		ip6_rt_put(rt);
5029 5030 5031
		err = -ENOBUFS;
		goto errout;
	}
L
Linus Torvalds 已提交
5032

5033
	skb_dst_set(skb, &rt->dst);
5034 5035 5036 5037

	rcu_read_lock();
	from = rcu_dereference(rt->from);

5038
	if (fibmatch)
5039
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
5040 5041 5042
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
5043 5044
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
5045 5046
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
5047 5048
	rcu_read_unlock();

L
Linus Torvalds 已提交
5049
	if (err < 0) {
5050 5051
		kfree_skb(skb);
		goto errout;
L
Linus Torvalds 已提交
5052 5053
	}

5054
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
5055
errout:
L
Linus Torvalds 已提交
5056 5057 5058
	return err;
}

5059
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
5060
		     unsigned int nlm_flags)
L
Linus Torvalds 已提交
5061 5062
{
	struct sk_buff *skb;
5063
	struct net *net = info->nl_net;
5064 5065 5066 5067
	u32 seq;
	int err;

	err = -ENOBUFS;
5068
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
5069

5070
	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
5071
	if (!skb)
5072 5073
		goto errout;

5074 5075
	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
5076 5077 5078 5079 5080 5081
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
5082
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
5083 5084
		    info->nlh, gfp_any());
	return;
5085 5086
errout:
	if (err < 0)
5087
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
L
Linus Torvalds 已提交
5088 5089
}

5090
/* Netdevice notifier: bind the per-netns special route entries
 * (null / prohibit / blackhole) to the loopback device when it
 * registers, and drop those references when it unregisters.
 * Only loopback events are of interest; everything else is ignored.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

L
Linus Torvalds 已提交
5124 5125 5126 5127 5128 5129 5130
/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
5131
	struct net *net = (struct net *)seq->private;
L
Linus Torvalds 已提交
5132
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
5133 5134
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
W
Wei Wang 已提交
5135
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
5136 5137
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
5138
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
5139
		   net->ipv6.rt6_stats->fib_discarded_routes);
L
Linus Torvalds 已提交
5140 5141 5142 5143 5144 5145 5146 5147

	return 0;
}
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

/* Handler for the write-only net.ipv6.route.flush sysctl: writing a
 * value triggers a garbage-collection pass over the routing cache.
 * The previously stored flush_delay (read before proc_dointvec
 * updates it) selects immediate vs. delayed expiry.
 */
static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	int ret;
	if (!write)
		return -EINVAL;

	/* ctl->extra1 carries the owning netns (set in sysctl_init). */
	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
	if (ret)
		return ret;

	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

5167 5168 5169
/* Bounds (extra1/extra2) for the boolean skip_notify_on_dev_down sysctl. */
static int zero;
static int one = 1;

5170
/* Template for the per-netns net.ipv6.route.* sysctl table. The .data
 * pointers reference init_net and are re-pointed to each namespace's
 * own storage in ipv6_route_sysctl_init(); entry order must stay in
 * sync with the table[N] indices used there.
 */
static struct ctl_table ipv6_route_table_template[] = {
	{
		/* write-only trigger: flush the route cache */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* millisecond view of the same gc_min_interval storage */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{
		.procname	=	"skip_notify_on_dev_down",
		.data		=	&init_net.ipv6.sysctl.skip_notify_on_dev_down,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
		.extra1		=	&zero,
		.extra2		=	&one,
	},
	{ }
};

5253
/* Duplicate the sysctl template for a new netns and repoint every
 * .data slot at that namespace's own storage. The table[N] indices
 * below must match the entry order in ipv6_route_table_template.
 * Returns the new table, or NULL on allocation failure (caller
 * handles NULL).
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		/* extra1 smuggles the netns into ipv6_sysctl_rtcache_flush() */
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		/* gc_min_interval_ms shares storage with gc_min_interval */
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[10].data = &net->ipv6.sysctl.skip_notify_on_dev_down;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif

5284
/* Per-netns init: set up the dst ops, allocate the special FIB/route
 * entries (null, and with multiple tables also prohibit/blackhole),
 * and seed the route-GC sysctl defaults. Uses the classic goto-unwind
 * pattern: each failure label frees everything allocated before it.
 * Returns 0 on success or -ENOMEM.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default values for the net.ipv6.route.* sysctls. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
	net->ipv6.sysctl.skip_notify_on_dev_down = 0;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

5360
/* Per-netns teardown: free the special entries allocated in
 * ip6_route_net_init() and release the dst entry counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

5371 5372 5373
/* Late per-netns init: create the /proc/net/ipv6_route and
 * /proc/net/rt6_stats entries. proc creation failures are
 * deliberately not treated as fatal here.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			rt6_stats_seq_show, NULL);
#endif
	return 0;
}

/* Late per-netns teardown: remove the proc entries created in
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

5390 5391 5392 5393 5394
/* Core per-netns ops for IPv6 routing state. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410
/* Allocate and initialize this namespace's inet_peer base for IPv6.
 * Returns 0 on success, -ENOMEM if the base cannot be allocated.
 */
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *base;

	base = kmalloc(sizeof(*base), GFP_KERNEL);
	if (!base)
		return -ENOMEM;

	inet_peer_base_init(base);
	net->ipv6.peers = base;
	return 0;
}

/* Tear down this namespace's IPv6 inet_peer base: detach it from the
 * netns first, then invalidate the peer tree and free the base.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *base;

	base = net->ipv6.peers;
	net->ipv6.peers = NULL;

	inetpeer_invalidate_tree(base);
	kfree(base);
}

5415
/* Per-netns ops for the IPv6 inet_peer base. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

5420 5421 5422 5423 5424
/* Per-netns ops registered late (proc entries). */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

5425 5426
/* Run after addrconf's notifier so addrconf state exists first. */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

5430 5431 5432 5433 5434
/* Wire init_net's special route entries to the loopback device.
 * Mirrors what ip6_route_dev_notify() does on NETDEV_REGISTER, but
 * done by hand because loopback registered before the notifier.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.fib_nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}

5446
/* Module init for IPv6 routing: create the rt6_info slab, register
 * pernet subsystems (inetpeer, core route state, late proc entries),
 * bring up the FIB, xfrm and policy-rule layers, register the
 * RTM_{NEW,DEL,GET}ROUTE rtnetlink handlers and the device notifier,
 * and initialize the per-cpu uncached-route lists. Error unwinding
 * runs strictly in reverse registration order via the labels below.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* blackhole dsts share the rt6_info slab created above */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	/* GETROUTE may run without the rtnl lock held */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	/* covers all three rtnl_register_module() failures above */
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

/* Module teardown: undo ip6_route_init() in reverse order. */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}