/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

27 28
#define pr_fmt(fmt) "IPv6: " fmt

29
#include <linux/capability.h>
L
Linus Torvalds 已提交
30
#include <linux/errno.h>
31
#include <linux/export.h>
L
Linus Torvalds 已提交
32 33 34 35 36 37 38 39
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
40
#include <linux/mroute6.h>
L
Linus Torvalds 已提交
41 42 43 44
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
45
#include <linux/nsproxy.h>
46
#include <linux/slab.h>
47
#include <net/net_namespace.h>
L
Linus Torvalds 已提交
48 49 50 51 52 53 54 55 56
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
57
#include <net/dst_metadata.h>
L
Linus Torvalds 已提交
58
#include <net/xfrm.h>
59
#include <net/netevent.h>
60
#include <net/netlink.h>
61
#include <net/nexthop.h>
62
#include <net/lwtunnel.h>
63
#include <net/ip_tunnels.h>
D
David Ahern 已提交
64
#include <net/l3mdev.h>
D
David Ahern 已提交
65
#include <trace/events/fib6.h>
L
Linus Torvalds 已提交
66

67
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
68 69 70 71 72

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

73
/*
 * Result codes produced by rt6_check_neigh() and rt6_score_route().
 * Negative values are failures; find_match() interprets them.
 */
enum rt6_nud_state {
	/* Route must not be used; find_match() skips it. */
	RT6_NUD_FAIL_HARD = -3,
	/* Gateway neighbour is in NUD_FAILED (CONFIG_IPV6_ROUTER_PREF). */
	RT6_NUD_FAIL_PROBE = -2,
	/* No usable neighbour; find_match() scores 0 and asks for round-robin. */
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

M
Martin KaFai Lau 已提交
80
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
L
Linus Torvalds 已提交
81
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
82
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
83
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
L
Linus Torvalds 已提交
84 85 86 87
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
88
static int		 ip6_dst_gc(struct dst_ops *ops);
L
Linus Torvalds 已提交
89 90

static int		ip6_pkt_discard(struct sk_buff *skb);
E
Eric W. Biederman 已提交
91
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92
static int		ip6_pkt_prohibit(struct sk_buff *skb);
E
Eric W. Biederman 已提交
93
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
L
Linus Torvalds 已提交
94
static void		ip6_link_failure(struct sk_buff *skb);
95 96 97 98
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
99
static void		rt6_dst_from_metrics_check(struct rt6_info *rt);
100
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101 102 103 104 105 106
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
L
Linus Torvalds 已提交
107

108
#ifdef CONFIG_IPV6_ROUTE_INFO
109
static struct rt6_info *rt6_add_route_info(struct net *net,
110
					   const struct in6_addr *prefix, int prefixlen,
111 112
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
113
					   unsigned int pref);
114
static struct rt6_info *rt6_get_route_info(struct net *net,
115
					   const struct in6_addr *prefix, int prefixlen,
116 117
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
118 119
#endif

120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
/*
 * List of rt6_info entries linked through rt6i_uncached
 * (see rt6_uncached_list_add()); one instance exists per CPU.
 */
struct uncached_list {
	spinlock_t		lock;	/* protects head; taken with _bh */
	struct list_head	head;	/* entries linked via rt6i_uncached */
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

/*
 * Mark @rt as DST_NOCACHE and append it to the current CPU's uncached
 * list.  The owning list is recorded in rt->rt6i_uncached_list so that
 * rt6_uncached_list_del() can later take the matching lock.
 */
static void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *cpu_list;

	cpu_list = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = cpu_list;
	rt->dst.flags |= DST_NOCACHE;

	spin_lock_bh(&cpu_list->lock);
	list_add_tail(&rt->rt6i_uncached, &cpu_list->head);
	spin_unlock_bh(&cpu_list->lock);
}

/*
 * Unlink @rt from the uncached list it was added to, if any.
 * An empty rt6i_uncached node means the route was never added.
 */
static void rt6_uncached_list_del(struct rt6_info *rt)
{
	struct uncached_list *owner;

	if (list_empty(&rt->rt6i_uncached))
		return;

	owner = rt->rt6i_uncached_list;

	spin_lock_bh(&owner->lock);
	list_del(&rt->rt6i_uncached);
	spin_unlock_bh(&owner->lock);
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

155 156 157
	if (dev == loopback_dev)
		return;

158 159 160 161 162 163 164 165 166
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

167
			if (rt_idev->dev == dev) {
168 169 170 171
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

172
			if (rt_dev == dev) {
173 174 175 176 177 178 179 180 181
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

M
Martin KaFai Lau 已提交
182 183 184 185 186
/*
 * Metrics of a per-cpu route are taken from the dst it was derived from
 * (rt->dst.from); return a writable pointer to that dst's metrics.
 */
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	struct dst_entry *origin = rt->dst.from;

	return dst_metrics_write_ptr(origin);
}

187 188
static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
189
	struct rt6_info *rt = (struct rt6_info *)dst;
190

M
Martin KaFai Lau 已提交
191 192 193
	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);
	else if (rt->rt6i_flags & RTF_CACHE)
194 195
		return NULL;
	else
196
		return dst_cow_metrics_generic(dst, old);
197 198
}

199 200 201
static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
202 203 204
{
	struct in6_addr *p = &rt->rt6i_gateway;

D
David S. Miller 已提交
205
	if (!ipv6_addr_any(p))
206
		return (const void *) p;
207 208
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
209 210 211
	return daddr;
}

212 213 214
static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
215
{
216 217 218
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

219
	daddr = choose_neigh_daddr(rt, skb, daddr);
220
	n = __ipv6_neigh_lookup(dst->dev, daddr);
221 222 223 224 225
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}

226
static struct dst_ops ip6_dst_ops_template = {
L
Linus Torvalds 已提交
227 228 229 230
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
231
	.default_advmss		=	ip6_default_advmss,
232
	.mtu			=	ip6_mtu,
233
	.cow_metrics		=	ipv6_cow_metrics,
L
Linus Torvalds 已提交
234 235 236 237 238
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
239
	.redirect		=	rt6_do_redirect,
240
	.local_out		=	__ip6_local_out,
241
	.neigh_lookup		=	ip6_neigh_lookup,
L
Linus Torvalds 已提交
242 243
};

244
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
245
{
246 247 248
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
249 250
}

251 252
/* dst_ops->update_pmtu for blackhole routes: intentionally a no-op. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

256 257
/* dst_ops->redirect for blackhole routes: intentionally a no-op. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

261 262 263 264
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
265
	.mtu			=	ip6_blackhole_mtu,
266
	.default_advmss		=	ip6_default_advmss,
267
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
268
	.redirect		=	ip6_rt_blackhole_redirect,
269
	.cow_metrics		=	dst_cow_metrics_generic,
270
	.neigh_lookup		=	ip6_neigh_lookup,
271 272
};

273
static const u32 ip6_template_metrics[RTAX_MAX] = {
L
Li RongQing 已提交
274
	[RTAX_HOPLIMIT - 1] = 0,
275 276
};

277
static const struct rt6_info ip6_null_entry_template = {
278 279 280
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
281
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
282 283 284
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
L
Linus Torvalds 已提交
285 286
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
287
	.rt6i_protocol  = RTPROT_KERNEL,
L
Linus Torvalds 已提交
288 289 290 291
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

T
Thomas Graf 已提交
292 293
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

/*
 * Template for the "prohibit" reject route: error -EACCES, packets go
 * to ip6_pkt_prohibit()/ip6_pkt_prohibit_out().
 */
static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

/*
 * Template for the "blackhole" reject route: error -EINVAL, packets go
 * to dst_discard()/dst_discard_out().
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif

326 327 328 329 330 331 332 333 334
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

L
Linus Torvalds 已提交
335
/* allocate dst with ip6_dst_ops */
M
Martin KaFai Lau 已提交
336 337
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
338
					int flags)
L
Linus Torvalds 已提交
339
{
340
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
341
					0, DST_OBSOLETE_FORCE_CHK, flags);
342

343 344
	if (rt)
		rt6_info_init(rt);
345

346
	return rt;
L
Linus Torvalds 已提交
347 348
}

349 350 351
struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
M
Martin KaFai Lau 已提交
352
{
353
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
M
Martin KaFai Lau 已提交
354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (rt->rt6i_pcpu) {
			int cpu;

			for_each_possible_cpu(cpu) {
				struct rt6_info **p;

				p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
				/* no one shares rt */
				*p =  NULL;
			}
		} else {
			dst_destroy((struct dst_entry *)rt);
			return NULL;
		}
	}

	return rt;
}
375
EXPORT_SYMBOL(ip6_dst_alloc);
M
Martin KaFai Lau 已提交
376

L
Linus Torvalds 已提交
377 378 379
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
380
	struct dst_entry *from = dst->from;
381
	struct inet6_dev *idev;
L
Linus Torvalds 已提交
382

383
	dst_destroy_metrics_generic(dst);
384
	free_percpu(rt->rt6i_pcpu);
385 386 387
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
388
	if (idev) {
L
Linus Torvalds 已提交
389 390
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
391
	}
392

393 394
	dst->from = NULL;
	dst_release(from);
395 396
}

L
Linus Torvalds 已提交
397 398 399 400 401
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
402
	struct net_device *loopback_dev =
403
		dev_net(dev)->loopback_dev;
L
Linus Torvalds 已提交
404

405 406 407 408 409 410 411 412 413
	if (dev != loopback_dev) {
		if (idev && idev->dev == dev) {
			struct inet6_dev *loopback_idev =
				in6_dev_get(loopback_dev);
			if (loopback_idev) {
				rt->rt6i_idev = loopback_idev;
				in6_dev_put(idev);
			}
		}
L
Linus Torvalds 已提交
414 415 416
	}
}

417 418 419 420 421 422 423 424
/*
 * True when @rt itself carries RTF_EXPIRES and its expiry time has
 * passed.  Unlike rt6_check_expired(), dst.from is not consulted.
 */
static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (!(rt->rt6i_flags & RTF_EXPIRES))
		return false;

	return time_after(jiffies, rt->dst.expires);
}

425
static bool rt6_check_expired(const struct rt6_info *rt)
L
Linus Torvalds 已提交
426
{
427 428
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
429
			return true;
430
	} else if (rt->dst.from) {
431
		return rt6_check_expired((struct rt6_info *) rt->dst.from);
432
	}
433
	return false;
L
Linus Torvalds 已提交
434 435
}

436 437 438 439 440 441 442
/* Multipath route selection:
 *   Hash based function using packet header and flowlabel.
 * Adapted from fib_info_hashfn()
 *
 * Returns an index in [0, candidate_count); @candidate_count must be
 * non-zero.
 */
static int rt6_info_hash_nhsfn(unsigned int candidate_count,
			       const struct flowi6 *fl6)
{
	return get_hash_from_flowi6(fl6) % candidate_count;
}

static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
447 448
					     struct flowi6 *fl6, int oif,
					     int strict)
449 450 451 452 453 454 455 456 457 458 459 460 461
{
	struct rt6_info *sibling, *next_sibling;
	int route_choosen;

	route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
	/* Don't change the route, if route_choosen == 0
	 * (siblings does not include ourself)
	 */
	if (route_choosen)
		list_for_each_entry_safe(sibling, next_sibling,
				&match->rt6i_siblings, rt6i_siblings) {
			route_choosen--;
			if (route_choosen == 0) {
462 463
				if (rt6_score_route(sibling, oif, strict) < 0)
					break;
464 465 466 467 468 469 470
				match = sibling;
				break;
			}
		}
	return match;
}

L
Linus Torvalds 已提交
471
/*
T
Thomas Graf 已提交
472
 *	Route lookup. Any table->tb6_lock is implied.
L
Linus Torvalds 已提交
473 474
 */

475 476
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
477
						    const struct in6_addr *saddr,
L
Linus Torvalds 已提交
478
						    int oif,
479
						    int flags)
L
Linus Torvalds 已提交
480 481 482 483
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

484 485 486
	if (!oif && ipv6_addr_any(saddr))
		goto out;

487
	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
488
		struct net_device *dev = sprt->dst.dev;
489 490

		if (oif) {
L
Linus Torvalds 已提交
491 492 493
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
494
				if (!sprt->rt6i_idev ||
L
Linus Torvalds 已提交
495
				    sprt->rt6i_idev->dev->ifindex != oif) {
496
					if (flags & RT6_LOOKUP_F_IFACE)
L
Linus Torvalds 已提交
497
						continue;
498 499
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
L
Linus Torvalds 已提交
500 501 502 503
						continue;
				}
				local = sprt;
			}
504 505 506 507
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
L
Linus Torvalds 已提交
508
		}
509
	}
L
Linus Torvalds 已提交
510

511
	if (oif) {
L
Linus Torvalds 已提交
512 513 514
		if (local)
			return local;

515
		if (flags & RT6_LOOKUP_F_IFACE)
516
			return net->ipv6.ip6_null_entry;
L
Linus Torvalds 已提交
517
	}
518
out:
L
Linus Torvalds 已提交
519 520 521
	return rt;
}

522
#ifdef CONFIG_IPV6_ROUTER_PREF
523 524 525 526 527 528 529 530 531 532 533 534 535
/*
 * Deferred router probe request: queued by rt6_probe() and consumed
 * (and freed) by rt6_probe_deferred().
 */
struct __rt6_probe_work {
	struct work_struct work;	/* runs rt6_probe_deferred() */
	struct in6_addr target;		/* gateway address to solicit */
	struct net_device *dev;		/* held until the work has run */
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
536
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
537
	dev_put(work->dev);
538
	kfree(work);
539 540
}

541 542
static void rt6_probe(struct rt6_info *rt)
{
543
	struct __rt6_probe_work *work;
544
	struct neighbour *neigh;
545 546 547 548 549 550 551 552
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
553
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
554
		return;
555 556 557
	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
558 559 560
		if (neigh->nud_state & NUD_VALID)
			goto out;

561
		work = NULL;
562
		write_lock(&neigh->lock);
563 564 565 566 567 568 569
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
570
		}
571
		write_unlock(&neigh->lock);
572 573
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
574
	}
575 576 577 578 579 580 581 582 583

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

584
out:
585
	rcu_read_unlock_bh();
586 587 588 589 590 591 592
}
#else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

L
Linus Torvalds 已提交
593
/*
594
 * Default Router Selection (RFC 2461 6.3.6)
L
Linus Torvalds 已提交
595
 */
D
Dave Jones 已提交
596
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
597
{
598
	struct net_device *dev = rt->dst.dev;
599
	if (!oif || dev->ifindex == oif)
600
		return 2;
601 602 603 604
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
605
}
L
Linus Torvalds 已提交
606

607
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
L
Linus Torvalds 已提交
608
{
609
	struct neighbour *neigh;
610
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
611

612 613
	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
614
		return RT6_NUD_SUCCEED;
615 616 617 618 619

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (neigh) {
		read_lock(&neigh->lock);
620
		if (neigh->nud_state & NUD_VALID)
621
			ret = RT6_NUD_SUCCEED;
622
#ifdef CONFIG_IPV6_ROUTER_PREF
623
		else if (!(neigh->nud_state & NUD_FAILED))
624
			ret = RT6_NUD_SUCCEED;
J
Jiri Benc 已提交
625 626
		else
			ret = RT6_NUD_FAIL_PROBE;
627
#endif
628
		read_unlock(&neigh->lock);
629 630
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
J
Jiri Benc 已提交
631
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
632
	}
633 634
	rcu_read_unlock_bh();

635
	return ret;
L
Linus Torvalds 已提交
636 637
}

638 639
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
L
Linus Torvalds 已提交
640
{
641
	int m;
642

643
	m = rt6_check_dev(rt, oif);
644
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
645
		return RT6_NUD_FAIL_HARD;
646 647 648
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
649 650 651 652 653
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
654 655 656
	return m;
}

657
/*
 * Consider @rt as a candidate and return the better of @rt and @match.
 *
 * @mpri holds the best score so far and is updated when @rt wins;
 * @do_rr is then set to whether the winner asked for round-robin
 * (RT6_NUD_FAIL_DO_RR, scored as 0).  A route is skipped when its
 * device has no carrier and ignore_routes_with_linkdown is set (unless
 * RT6_LOOKUP_F_IGNORE_LINKSTATE), when it has expired, or when scoring
 * returns RT6_NUD_FAIL_HARD.  Under RT6_LOOKUP_F_REACHABLE every
 * surviving candidate is also passed to rt6_probe().
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	int m;
	bool match_do_rr = false;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *dev = rt->dst.dev;

	if (dev && !netif_carrier_ok(dev) &&
	    idev->cnf.ignore_routes_with_linkdown &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		goto out;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m == RT6_NUD_FAIL_DO_RR) {
		match_do_rr = true;
		m = 0; /* lowest valid score */
	} else if (m == RT6_NUD_FAIL_HARD) {
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
	if (m > *mpri) {
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
697 698
				     u32 metric, int oif, int strict,
				     bool *do_rr)
699
{
700
	struct rt6_info *rt, *match, *cont;
701
	int mpri = -1;
L
Linus Torvalds 已提交
702

703
	match = NULL;
704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719
	cont = NULL;
	for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

720
		match = find_match(rt, oif, strict, &mpri, match, do_rr);
721 722 723 724 725 726
	}

	if (match || !cont)
		return match;

	for (rt = cont; rt; rt = rt->dst.rt6_next)
727
		match = find_match(rt, oif, strict, &mpri, match, do_rr);
L
Linus Torvalds 已提交
728

729 730
	return match;
}
L
Linus Torvalds 已提交
731

732 733 734
static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
735
	struct net *net;
736
	bool do_rr = false;
L
Linus Torvalds 已提交
737

738 739 740
	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;
L
Linus Torvalds 已提交
741

742 743
	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
			     &do_rr);
L
Linus Torvalds 已提交
744

745
	if (do_rr) {
746
		struct rt6_info *next = rt0->dst.rt6_next;
747

748
		/* no entries matched; do round-robin */
749 750 751 752 753
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
L
Linus Torvalds 已提交
754 755
	}

756
	net = dev_net(rt0->dst.dev);
E
Eric Dumazet 已提交
757
	return match ? match : net->ipv6.ip6_null_entry;
L
Linus Torvalds 已提交
758 759
}

760 761 762 763 764
static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

765 766
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a received Route Information option.
 *
 * @dev:    device the option was received on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    length of the option in bytes
 * @gwaddr: address of the advertising router
 *
 * Validates the option, then adds, updates or (for a zero lifetime)
 * deletes the corresponding route; a zero prefix length refers to the
 * default router entry.  Returns 0 on success or -EINVAL for a
 * malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	/*
	 * len is signed while sizeof() is unsigned: reject negative values
	 * explicitly, a plain comparison would let them through.
	 */
	if (len < 0 || (size_t)len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* a zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			rt6_clean_expires(rt);
		else
			rt6_set_expires(rt, jiffies + HZ * lifetime);

		ip6_rt_put(rt);
	}
	return 0;
}
#endif

M
Martin KaFai Lau 已提交
841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856
/*
 * Climb from @fn towards the root until a node carrying route
 * information (RTN_RTINFO) is found.  When the parent has a source
 * subtree that @fn is not the root of, that subtree is searched for
 * @saddr instead of stepping to the parent.  Returns NULL once a
 * top-level root (RTN_TL_ROOT) is reached.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	for (;;) {
		struct fib6_node *parent, *subtree;

		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;

		parent = fn->parent;
		subtree = FIB6_SUBTREE(parent);
		fn = (subtree && subtree != fn) ?
			fib6_lookup(subtree, NULL, saddr) : parent;

		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
T
Thomas Graf 已提交
857

858 859
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
860
					     struct flowi6 *fl6, int flags)
L
Linus Torvalds 已提交
861 862 863 864
{
	struct fib6_node *fn;
	struct rt6_info *rt;

T
Thomas Graf 已提交
865
	read_lock_bh(&table->tb6_lock);
866
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
T
Thomas Graf 已提交
867 868
restart:
	rt = fn->leaf;
869
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
870
	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
871
		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
M
Martin KaFai Lau 已提交
872 873 874 875 876
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
877
	dst_use(&rt->dst, jiffies);
T
Thomas Graf 已提交
878
	read_unlock_bh(&table->tb6_lock);
D
David Ahern 已提交
879 880 881

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);

T
Thomas Graf 已提交
882 883 884 885
	return rt;

}

886
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
F
Florian Westphal 已提交
887 888 889 890 891 892
				    int flags)
{
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

893 894
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
T
Thomas Graf 已提交
895
{
896 897 898
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
T
Thomas Graf 已提交
899 900
	};
	struct dst_entry *dst;
901
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
T
Thomas Graf 已提交
902

903
	if (saddr) {
904
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
905 906 907
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

908
	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
T
Thomas Graf 已提交
909 910 911 912 913
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

L
Linus Torvalds 已提交
914 915
	return NULL;
}
916 917
EXPORT_SYMBOL(rt6_lookup);

T
Thomas Graf 已提交
918
/* ip6_ins_rt is called with FREE table->tb6_lock.
L
Linus Torvalds 已提交
919 920 921 922 923
   It takes new route entry, the addition fails by any reason the
   route is freed. In any case, if caller does not hold it, it may
   be destroyed.
 */

924
static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
925
			struct mx6_config *mxc)
L
Linus Torvalds 已提交
926 927
{
	int err;
T
Thomas Graf 已提交
928
	struct fib6_table *table;
L
Linus Torvalds 已提交
929

T
Thomas Graf 已提交
930 931
	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
932
	err = fib6_add(&table->tb6_root, rt, info, mxc);
T
Thomas Graf 已提交
933
	write_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
934 935 936 937

	return err;
}

938 939
int ip6_ins_rt(struct rt6_info *rt)
{
940 941 942 943
	struct nl_info info = {	.nl_net = dev_net(rt->dst.dev), };
	struct mx6_config mxc = { .mx = NULL, };

	return __ip6_ins_rt(rt, &info, &mxc);
944 945
}

946 947 948
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
L
Linus Torvalds 已提交
949 950 951 952 953 954 955
{
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

M
Martin KaFai Lau 已提交
956
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
M
Martin KaFai Lau 已提交
957
		ort = (struct rt6_info *)ort->dst.from;
L
Linus Torvalds 已提交
958

959
	rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
M
Martin KaFai Lau 已提交
960 961 962 963 964 965 966 967 968 969

	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;
L
Linus Torvalds 已提交
970

M
Martin KaFai Lau 已提交
971 972 973 974
	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
L
Linus Torvalds 已提交
975
#ifdef CONFIG_IPV6_SUBTREES
M
Martin KaFai Lau 已提交
976 977 978
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
979
		}
M
Martin KaFai Lau 已提交
980
#endif
981
	}
L
Linus Torvalds 已提交
982

983 984
	return rt;
}
L
Linus Torvalds 已提交
985

M
Martin KaFai Lau 已提交
986 987 988 989 990
static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt;

	pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
991
				  rt->dst.dev, rt->dst.flags);
M
Martin KaFai Lau 已提交
992 993 994 995 996 997 998 999 1000 1001 1002 1003

	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

/* It should be called with read_lock_bh(&tb6_lock) acquired */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
1004
	struct rt6_info *pcpu_rt, **p;
M
Martin KaFai Lau 已提交
1005 1006 1007 1008

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

1009 1010 1011 1012 1013 1014 1015 1016 1017
	if (pcpu_rt) {
		dst_hold(&pcpu_rt->dst);
		rt6_dst_from_metrics_check(pcpu_rt);
	}
	return pcpu_rt;
}

static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
1018
	struct fib6_table *table = rt->rt6i_table;
1019
	struct rt6_info *pcpu_rt, *prev, **p;
M
Martin KaFai Lau 已提交
1020 1021 1022 1023 1024

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
		struct net *net = dev_net(rt->dst.dev);

1025 1026
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
M
Martin KaFai Lau 已提交
1027 1028
	}

1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044
	read_lock_bh(&table->tb6_lock);
	if (rt->rt6i_pcpu) {
		p = this_cpu_ptr(rt->rt6i_pcpu);
		prev = cmpxchg(p, NULL, pcpu_rt);
		if (prev) {
			/* If someone did it before us, return prev instead */
			dst_destroy(&pcpu_rt->dst);
			pcpu_rt = prev;
		}
	} else {
		/* rt has been removed from the fib6 tree
		 * before we have a chance to acquire the read_lock.
		 * In this case, don't brother to create a pcpu rt
		 * since rt is going away anyway.  The next
		 * dst_check() will trigger a re-lookup.
		 */
M
Martin KaFai Lau 已提交
1045
		dst_destroy(&pcpu_rt->dst);
1046
		pcpu_rt = rt;
M
Martin KaFai Lau 已提交
1047 1048 1049
	}
	dst_hold(&pcpu_rt->dst);
	rt6_dst_from_metrics_check(pcpu_rt);
1050
	read_unlock_bh(&table->tb6_lock);
M
Martin KaFai Lau 已提交
1051 1052 1053
	return pcpu_rt;
}

1054 1055
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6, int flags)
L
Linus Torvalds 已提交
1056
{
1057
	struct fib6_node *fn, *saved_fn;
1058
	struct rt6_info *rt;
T
Thomas Graf 已提交
1059
	int strict = 0;
L
Linus Torvalds 已提交
1060

1061
	strict |= flags & RT6_LOOKUP_F_IFACE;
1062
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1063 1064
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;
L
Linus Torvalds 已提交
1065

T
Thomas Graf 已提交
1066
	read_lock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
1067

1068
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1069
	saved_fn = fn;
L
Linus Torvalds 已提交
1070

D
David Ahern 已提交
1071 1072 1073
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

M
Martin KaFai Lau 已提交
1074
redo_rt6_select:
1075
	rt = rt6_select(fn, oif, strict);
1076
	if (rt->rt6i_nsiblings)
1077
		rt = rt6_multipath_select(rt, fl6, oif, strict);
M
Martin KaFai Lau 已提交
1078 1079 1080 1081
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
1082 1083 1084 1085 1086 1087
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
M
Martin KaFai Lau 已提交
1088 1089
	}

1090

1091
	if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
M
Martin KaFai Lau 已提交
1092 1093 1094 1095
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

		rt6_dst_from_metrics_check(rt);
D
David Ahern 已提交
1096 1097

		trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
M
Martin KaFai Lau 已提交
1098
		return rt;
1099 1100 1101 1102 1103 1104 1105 1106 1107 1108
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

M
Martin KaFai Lau 已提交
1109 1110 1111
		dst_use(&rt->dst, jiffies);
		read_unlock_bh(&table->tb6_lock);

1112 1113
		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);
T
Thomas Graf 已提交
1114

1115
		if (uncached_rt)
1116
			rt6_uncached_list_add(uncached_rt);
1117 1118
		else
			uncached_rt = net->ipv6.ip6_null_entry;
M
Martin KaFai Lau 已提交
1119

1120
		dst_hold(&uncached_rt->dst);
D
David Ahern 已提交
1121 1122

		trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1123 1124
		return uncached_rt;

M
Martin KaFai Lau 已提交
1125 1126 1127 1128 1129 1130 1131 1132 1133
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		rt->dst.lastuse = jiffies;
		rt->dst.__use++;
		pcpu_rt = rt6_get_pcpu_route(rt);

1134 1135 1136 1137 1138 1139 1140 1141 1142
		if (pcpu_rt) {
			read_unlock_bh(&table->tb6_lock);
		} else {
			/* We have to do the read_unlock first
			 * because rt6_make_pcpu_route() may trigger
			 * ip6_dst_gc() which will take the write_lock.
			 */
			dst_hold(&rt->dst);
			read_unlock_bh(&table->tb6_lock);
1143
			pcpu_rt = rt6_make_pcpu_route(rt);
1144 1145
			dst_release(&rt->dst);
		}
M
Martin KaFai Lau 已提交
1146

D
David Ahern 已提交
1147
		trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
M
Martin KaFai Lau 已提交
1148
		return pcpu_rt;
1149

M
Martin KaFai Lau 已提交
1150
	}
L
Linus Torvalds 已提交
1151
}
1152
EXPORT_SYMBOL_GPL(ip6_pol_route);
L
Linus Torvalds 已提交
1153

1154
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1155
					    struct flowi6 *fl6, int flags)
1156
{
1157
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1158 1159
}

1160 1161 1162
struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6, int flags)
1163 1164 1165 1166 1167 1168
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}
1169
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1170

T
Thomas Graf 已提交
1171 1172
void ip6_route_input(struct sk_buff *skb)
{
1173
	const struct ipv6hdr *iph = ipv6_hdr(skb);
1174
	struct net *net = dev_net(skb->dev);
1175
	int flags = RT6_LOOKUP_F_HAS_SADDR;
1176
	struct ip_tunnel_info *tun_info;
1177
	struct flowi6 fl6 = {
1178
		.flowi6_iif = skb->dev->ifindex,
1179 1180
		.daddr = iph->daddr,
		.saddr = iph->saddr,
1181
		.flowlabel = ip6_flowinfo(iph),
1182 1183
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
T
Thomas Graf 已提交
1184
	};
1185

1186
	tun_info = skb_tunnel_info(skb);
1187
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1188
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1189
	skb_dst_drop(skb);
1190
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
T
Thomas Graf 已提交
1191 1192
}

1193
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1194
					     struct flowi6 *fl6, int flags)
L
Linus Torvalds 已提交
1195
{
1196
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
T
Thomas Graf 已提交
1197 1198
}

1199 1200
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
T
Thomas Graf 已提交
1201
{
1202
	bool any_src;
T
Thomas Graf 已提交
1203

1204 1205 1206 1207 1208 1209 1210
	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}
D
David Ahern 已提交
1211

1212
	fl6->flowi6_iif = LOOPBACK_IFINDEX;
1213

1214
	any_src = ipv6_addr_any(&fl6->saddr);
1215
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1216
	    (fl6->flowi6_oif && any_src))
1217
		flags |= RT6_LOOKUP_F_IFACE;
T
Thomas Graf 已提交
1218

1219
	if (!any_src)
1220
		flags |= RT6_LOOKUP_F_HAS_SADDR;
1221 1222
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1223

1224
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
L
Linus Torvalds 已提交
1225
}
1226
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
L
Linus Torvalds 已提交
1227

1228
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1229
{
1230
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1231 1232
	struct dst_entry *new = NULL;

1233
	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1234
	if (rt) {
1235
		rt6_info_init(rt);
1236

1237
		new = &rt->dst;
1238
		new->__use = 1;
1239
		new->input = dst_discard;
E
Eric W. Biederman 已提交
1240
		new->output = dst_discard_out;
1241

1242
		dst_copy_metrics(new, &ort->dst);
1243 1244 1245 1246
		rt->rt6i_idev = ort->rt6i_idev;
		if (rt->rt6i_idev)
			in6_dev_hold(rt->rt6i_idev);

A
Alexey Dobriyan 已提交
1247
		rt->rt6i_gateway = ort->rt6i_gateway;
1248
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1249 1250 1251 1252 1253 1254 1255 1256 1257 1258
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif

		dst_free(new);
	}

1259 1260
	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
1261 1262
}

L
Linus Torvalds 已提交
1263 1264 1265 1266
/*
 *	Destination cache support functions
 */

1267 1268 1269 1270 1271 1272 1273
/* Keep a derived route's metrics pointing at its parent's: when @rt has
 * a dst.from and the two metrics pointers differ, re-point @rt at the
 * parent's metrics (read-only).
 */
static void rt6_dst_from_metrics_check(struct rt6_info *rt)
{
	struct dst_entry *from = rt->dst.from;

	if (!from)
		return;

	if (dst_metrics_ptr(&rt->dst) == dst_metrics_ptr(from))
		return;

	dst_init_metrics(&rt->dst, dst_metrics_ptr(from), true);
}

1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286
/* Validate @rt against @cookie.  The route is rejected (NULL) when it
 * is no longer attached to a fib6 node, when the node's serial number
 * differs from @cookie, or when rt6_check_expired() reports it expired.
 * Otherwise its dst is returned.
 */
static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
{
	bool stale = !rt->rt6i_node || rt->rt6i_node->fn_sernum != cookie;

	if (stale || rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

/* Validate a derived route through its parent: @rt itself must not be
 * expired and must still be marked DST_OBSOLETE_FORCE_CHK, and the
 * route it was derived from (dst.from) must pass rt6_check() against
 * @cookie.  Returns @rt's dst on success, NULL otherwise.
 */
static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    rt6_check((struct rt6_info *)(rt->dst.from), cookie))
		return &rt->dst;
	else
		return NULL;
}

L
Linus Torvalds 已提交
1295 1296 1297 1298 1299 1300
/* Validate a cached IPv6 dst against @cookie.  Returns @dst when it is
 * still usable, NULL when the holder must look the route up again.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	rt6_dst_from_metrics_check(rt);

	/* Per-cpu clones, and DST_NOCACHE routes that have a parent, are
	 * validated through the route they were derived from.
	 */
	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
1320 1321 1322 1323 1324 1325
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
L
Linus Torvalds 已提交
1326
			dst_release(dst);
1327 1328
			dst = NULL;
		}
L
Linus Torvalds 已提交
1329
	}
1330
	return dst;
L
Linus Torvalds 已提交
1331 1332 1333 1334 1335 1336
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

1337
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
L
Linus Torvalds 已提交
1338

E
Eric Dumazet 已提交
1339
	rt = (struct rt6_info *) skb_dst(skb);
L
Linus Torvalds 已提交
1340
	if (rt) {
1341 1342
		if (rt->rt6i_flags & RTF_CACHE) {
			dst_hold(&rt->dst);
M
Martin KaFai Lau 已提交
1343
			ip6_del_rt(rt);
1344
		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) {
L
Linus Torvalds 已提交
1345
			rt->rt6i_node->fn_sernum = -1;
1346
		}
L
Linus Torvalds 已提交
1347 1348 1349
	}
}

1350 1351 1352 1353 1354 1355 1356 1357 1358
/* Record a learned path MTU on @rt: store @mtu, flag the route as
 * RTF_MODIFIED and refresh its expiry using the namespace's
 * ip6_rt_mtu_expires sysctl.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	rt->rt6i_pmtu = mtu;
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt,
			   dev_net(rt->dst.dev)->ipv6.sysctl.ip6_rt_mtu_expires);
}

1359 1360 1361 1362 1363 1364
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
}

1365 1366
/* Apply a path-MTU update to @dst.
 *
 * @dst: route the update applies to
 * @sk:  socket supplying the addresses when @iph is NULL (may be NULL)
 * @iph: IPv6 header supplying the addresses (may be NULL)
 * @mtu: reported MTU; raised to IPV6_MIN_MTU if smaller
 *
 * Nothing is done for RTF_LOCAL routes, when the MTU metric is locked,
 * or when the clamped @mtu is not below the current dst MTU.  The
 * update is either written to @dst directly or, when
 * rt6_cache_allowed_for_pmtu() permits, to a new RTF_CACHE clone that
 * is inserted into the tree.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	dst_confirm(dst);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
	} else {
		const struct in6_addr *daddr, *saddr;
		struct rt6_info *nrt6;

		if (iph) {
			daddr = &iph->daddr;
			saddr = &iph->saddr;
		} else if (sk) {
			daddr = &sk->sk_v6_daddr;
			saddr = &inet6_sk(sk)->saddr;
		} else {
			return;
		}
		nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);

			/* ip6_ins_rt(nrt6) will bump the
			 * rt6->rt6i_node->fn_sernum
			 * which will fail the next rt6_check() and
			 * invalidate the sk->sk_dst_cache.
			 */
			ip6_ins_rt(nrt6);
		}
	}
}

1410 1411 1412 1413 1414 1415
/* PMTU update entry point taking an skb: hands the skb's IPv6 header
 * (or NULL when there is no skb) to __ip6_rt_update_pmtu().
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	const struct ipv6hdr *iph = NULL;

	if (skb)
		iph = ipv6_hdr(skb);

	__ip6_rt_update_pmtu(dst, sk, iph, mtu);
}

1416
/* Update the path MTU for the flow described by the IPv6 header found
 * at skb->data.
 *
 * @mtu:  new MTU in network byte order
 * @oif:  output interface index for the route lookup
 * @mark: flow mark; when 0, IP6_REPLY_MARK(net, skb->mark) is used
 * @uid:  uid recorded in the flow
 *
 * An output route is looked up for the flow and, unless it carries an
 * error, the PMTU update is applied to it.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

/* Socket flavour of ip6_update_pmtu(): the lookup uses the socket's
 * bound device, mark and uid.  Afterwards, if the socket's cached dst
 * is obsolete and fails its ->check(), the datagram dst is refreshed,
 * but only when the socket is not owned by user context and its
 * destination is not an IPv4-mapped address.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472
/* Handle redirects */

/* Flow key extended with the redirecting router's address.  fl6 must
 * stay the first member: __ip6_route_redirect() receives a pointer to
 * fl6 and casts it back to struct ip6rd_flowi to reach gateway.
 */
struct ip6rd_flowi {
	struct flowi6 fl6;		/* regular flow key used for the lookup */
	struct in6_addr gateway;	/* address the redirect came from */
};

/* fib6_rule_lookup() callback used when validating a redirect.
 *
 * @fl6 must be the fl6 member of a struct ip6rd_flowi; the gateway
 * stored next to it is the address the redirect was received from.
 *
 * Returns, with a reference held, the first non-expired RTF_GATEWAY
 * route on the flow's output interface whose gateway equals that
 * address, backtracking up the tree when a node has no match.
 * ip6_null_entry is returned when nothing matches or when a route
 * carrying an error is met first.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	read_lock_bh(&table->tb6_lock);
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
		if (rt6_check_expired(rt))
			continue;
		if (rt->dst.error)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->dst.dev->ifindex)
			continue;
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
			continue;
		break;
	}

	if (!rt)
		rt = net->ipv6.ip6_null_entry;
	else if (rt->dst.error) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	dst_hold(&rt->dst);

	read_unlock_bh(&table->tb6_lock);

	trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
	return rt;
}

static struct dst_entry *ip6_route_redirect(struct net *net,
					const struct flowi6 *fl6,
					const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6,
				flags, __ip6_route_redirect);
}

1535 1536
/* Process a redirect for the flow described by the IPv6 header found at
 * skb->data.  The redirecting router is taken to be the source address
 * of the header returned by ipv6_hdr(skb); the matching route is looked
 * up and passed to rt6_do_redirect().
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

1557 1558 1559 1560 1561 1562 1563 1564 1565
/* Variant of ip6_redirect() that takes the flow from the redirect
 * message itself: the flow destination is the message's dest field and
 * the flow source is the destination address of the IPv6 header
 * carrying the redirect.  The redirecting router is that header's
 * source address.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

1578 1579
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
1580 1581
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
1582 1583 1584
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

1585
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
L
Linus Torvalds 已提交
1586
{
1587 1588 1589 1590
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

L
Linus Torvalds 已提交
1591 1592
	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

1593 1594
	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
L
Linus Torvalds 已提交
1595 1596

	/*
1597 1598 1599
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
L
Linus Torvalds 已提交
1600 1601 1602 1603 1604 1605 1606
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}

1607
static unsigned int ip6_mtu(const struct dst_entry *dst)
1608
{
1609 1610
	const struct rt6_info *rt = (const struct rt6_info *)dst;
	unsigned int mtu = rt->rt6i_pmtu;
1611
	struct inet6_dev *idev;
1612

1613 1614 1615 1616
	if (mtu)
		goto out;

	mtu = dst_metric_raw(dst, RTAX_MTU);
1617
	if (mtu)
E
Eric Dumazet 已提交
1618
		goto out;
1619 1620

	mtu = IPV6_MIN_MTU;
1621 1622 1623 1624 1625 1626 1627

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

E
Eric Dumazet 已提交
1628
out:
1629 1630 1631
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1632 1633
}

1634 1635
/* Singly linked list (through dst->next) of the dsts created by
 * icmp6_dst_alloc(); walked by icmp6_dst_gc() and icmp6_clean_all().
 */
static struct dst_entry *icmp6_dst_gc_list;
/* Protects icmp6_dst_gc_list; always taken with the _bh variants here. */
static DEFINE_SPINLOCK(icmp6_dst_lock);
1636

1637
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1638
				  struct flowi6 *fl6)
L
Linus Torvalds 已提交
1639
{
1640
	struct dst_entry *dst;
L
Linus Torvalds 已提交
1641 1642
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
1643
	struct net *net = dev_net(dev);
L
Linus Torvalds 已提交
1644

1645
	if (unlikely(!idev))
E
Eric Dumazet 已提交
1646
		return ERR_PTR(-ENODEV);
L
Linus Torvalds 已提交
1647

1648
	rt = ip6_dst_alloc(net, dev, 0);
1649
	if (unlikely(!rt)) {
L
Linus Torvalds 已提交
1650
		in6_dev_put(idev);
1651
		dst = ERR_PTR(-ENOMEM);
L
Linus Torvalds 已提交
1652 1653 1654
		goto out;
	}

1655 1656
	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
1657
	atomic_set(&rt->dst.__refcnt, 1);
1658
	rt->rt6i_gateway  = fl6->daddr;
1659
	rt->rt6i_dst.addr = fl6->daddr;
1660 1661
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
L
Li RongQing 已提交
1662
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
L
Linus Torvalds 已提交
1663

1664
	spin_lock_bh(&icmp6_dst_lock);
1665 1666
	rt->dst.next = icmp6_dst_gc_list;
	icmp6_dst_gc_list = &rt->dst;
1667
	spin_unlock_bh(&icmp6_dst_lock);
L
Linus Torvalds 已提交
1668

1669
	fib6_force_start_gc(net);
L
Linus Torvalds 已提交
1670

1671 1672
	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

L
Linus Torvalds 已提交
1673
out:
1674
	return dst;
L
Linus Torvalds 已提交
1675 1676
}

1677
int icmp6_dst_gc(void)
L
Linus Torvalds 已提交
1678
{
1679
	struct dst_entry *dst, **pprev;
1680
	int more = 0;
L
Linus Torvalds 已提交
1681

1682 1683
	spin_lock_bh(&icmp6_dst_lock);
	pprev = &icmp6_dst_gc_list;
1684

L
Linus Torvalds 已提交
1685 1686 1687 1688 1689 1690
	while ((dst = *pprev) != NULL) {
		if (!atomic_read(&dst->__refcnt)) {
			*pprev = dst->next;
			dst_free(dst);
		} else {
			pprev = &dst->next;
1691
			++more;
L
Linus Torvalds 已提交
1692 1693 1694
		}
	}

1695
	spin_unlock_bh(&icmp6_dst_lock);
1696

1697
	return more;
L
Linus Torvalds 已提交
1698 1699
}

1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718
/* Run @func(rt, @arg) on every entry of icmp6_dst_gc_list under
 * icmp6_dst_lock.  Entries for which @func returns non-zero are
 * unlinked and freed; the rest stay on the list.
 */
static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
			    void *arg)
{
	struct dst_entry **link = &icmp6_dst_gc_list;
	struct dst_entry *cur;

	spin_lock_bh(&icmp6_dst_lock);
	for (cur = *link; cur; cur = *link) {
		if (!func((struct rt6_info *)cur, arg)) {
			/* keep it: advance to the next link */
			link = &cur->next;
			continue;
		}
		*link = cur->next;
		dst_free(cur);
	}
	spin_unlock_bh(&icmp6_dst_lock);
}

1719
static int ip6_dst_gc(struct dst_ops *ops)
L
Linus Torvalds 已提交
1720
{
1721
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1722 1723 1724 1725 1726
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1727
	int entries;
1728

1729
	entries = dst_entries_get_fast(ops);
1730
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1731
	    entries <= rt_max_size)
L
Linus Torvalds 已提交
1732 1733
		goto out;

1734
	net->ipv6.ip6_rt_gc_expire++;
1735
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1736 1737
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
1738
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
L
Linus Torvalds 已提交
1739
out:
1740
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1741
	return entries > rt_max_size;
L
Linus Torvalds 已提交
1742 1743
}

1744 1745 1746
static int ip6_convert_metrics(struct mx6_config *mxc,
			       const struct fib6_config *cfg)
{
1747
	bool ecn_ca = false;
1748 1749 1750 1751
	struct nlattr *nla;
	int remaining;
	u32 *mp;

1752
	if (!cfg->fc_mx)
1753 1754 1755 1756 1757 1758 1759 1760
		return 0;

	mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
	if (unlikely(!mp))
		return -ENOMEM;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);
1761
		u32 val;
1762

1763 1764 1765 1766
		if (!type)
			continue;
		if (unlikely(type > RTAX_MAX))
			goto err;
1767

1768 1769
		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];
1770

1771
			nla_strlcpy(tmp, nla, sizeof(tmp));
1772
			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1773 1774 1775 1776
			if (val == TCP_CA_UNSPEC)
				goto err;
		} else {
			val = nla_get_u32(nla);
1777
		}
1778 1779
		if (type == RTAX_HOPLIMIT && val > 255)
			val = 255;
1780 1781
		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
			goto err;
1782 1783 1784

		mp[type - 1] = val;
		__set_bit(type - 1, mxc->mx_valid);
1785 1786
	}

1787 1788 1789 1790
	if (ecn_ca) {
		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
	}
1791

1792
	mxc->mx = mp;
1793 1794 1795 1796 1797
	return 0;
 err:
	kfree(mp);
	return -EINVAL;
}
L
Linus Torvalds 已提交
1798

1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;
1810
	int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829

	table = fib6_get_table(net, cfg->fc_table);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}

1830
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
L
Linus Torvalds 已提交
1831
{
1832
	struct net *net = cfg->fc_nlinfo.nl_net;
L
Linus Torvalds 已提交
1833 1834 1835
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
T
Thomas Graf 已提交
1836
	struct fib6_table *table;
L
Linus Torvalds 已提交
1837
	int addr_type;
1838
	int err = -EINVAL;
L
Linus Torvalds 已提交
1839

1840
	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1841
		goto out;
L
Linus Torvalds 已提交
1842
#ifndef CONFIG_IPV6_SUBTREES
1843
	if (cfg->fc_src_len)
1844
		goto out;
L
Linus Torvalds 已提交
1845
#endif
1846
	if (cfg->fc_ifindex) {
L
Linus Torvalds 已提交
1847
		err = -ENODEV;
1848
		dev = dev_get_by_index(net, cfg->fc_ifindex);
L
Linus Torvalds 已提交
1849 1850 1851 1852 1853 1854 1855
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

1856 1857
	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;
L
Linus Torvalds 已提交
1858

1859
	err = -ENOBUFS;
1860 1861
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1862
		table = fib6_get_table(net, cfg->fc_table);
1863
		if (!table) {
1864
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1865 1866 1867 1868 1869
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}
1870 1871

	if (!table)
T
Thomas Graf 已提交
1872 1873
		goto out;

1874 1875
	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
L
Linus Torvalds 已提交
1876

1877
	if (!rt) {
L
Linus Torvalds 已提交
1878 1879 1880 1881
		err = -ENOMEM;
		goto out;
	}

1882 1883 1884 1885 1886
	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);
L
Linus Torvalds 已提交
1887

1888 1889 1890 1891 1892
	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);
L
Linus Torvalds 已提交
1893 1894

	if (addr_type & IPV6_ADDR_MULTICAST)
1895
		rt->dst.input = ip6_mc_input;
1896 1897
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
L
Linus Torvalds 已提交
1898
	else
1899
		rt->dst.input = ip6_forward;
L
Linus Torvalds 已提交
1900

1901
	rt->dst.output = ip6_output;
L
Linus Torvalds 已提交
1902

1903 1904 1905
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

1906
		err = lwtunnel_build_state(cfg->fc_encap_type,
1907 1908
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate);
1909 1910
		if (err)
			goto out;
1911 1912 1913 1914
		rt->dst.lwtstate = lwtstate_get(lwtstate);
		if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_output = rt->dst.output;
			rt->dst.output = lwtunnel_output;
1915
		}
1916 1917 1918
		if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
			rt->dst.lwtstate->orig_input = rt->dst.input;
			rt->dst.input = lwtunnel_input;
1919
		}
1920 1921
	}

1922 1923
	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
1924
	if (rt->rt6i_dst.plen == 128)
1925 1926
		rt->dst.flags |= DST_HOST;

L
Linus Torvalds 已提交
1927
#ifdef CONFIG_IPV6_SUBTREES
1928 1929
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
L
Linus Torvalds 已提交
1930 1931
#endif

1932
	rt->rt6i_metric = cfg->fc_metric;
L
Linus Torvalds 已提交
1933 1934 1935 1936

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
1937
	if ((cfg->fc_flags & RTF_REJECT) ||
1938 1939 1940
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
L
Linus Torvalds 已提交
1941
		/* hold loopback dev/idev if we haven't done so. */
1942
		if (dev != net->loopback_dev) {
L
Linus Torvalds 已提交
1943 1944 1945 1946
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
1947
			dev = net->loopback_dev;
L
Linus Torvalds 已提交
1948 1949 1950 1951 1952 1953 1954 1955
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1956 1957 1958
		switch (cfg->fc_type) {
		case RTN_BLACKHOLE:
			rt->dst.error = -EINVAL;
E
Eric W. Biederman 已提交
1959
			rt->dst.output = dst_discard_out;
1960
			rt->dst.input = dst_discard;
1961 1962 1963
			break;
		case RTN_PROHIBIT:
			rt->dst.error = -EACCES;
1964 1965
			rt->dst.output = ip6_pkt_prohibit_out;
			rt->dst.input = ip6_pkt_prohibit;
1966
			break;
1967
		case RTN_THROW:
1968
		case RTN_UNREACHABLE:
1969
		default:
1970
			rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1971 1972
					: (cfg->fc_type == RTN_UNREACHABLE)
					? -EHOSTUNREACH : -ENETUNREACH;
1973 1974
			rt->dst.output = ip6_pkt_discard_out;
			rt->dst.input = ip6_pkt_discard;
1975 1976
			break;
		}
L
Linus Torvalds 已提交
1977 1978 1979
		goto install_route;
	}

1980
	if (cfg->fc_flags & RTF_GATEWAY) {
1981
		const struct in6_addr *gw_addr;
L
Linus Torvalds 已提交
1982 1983
		int gwa_type;

1984
		gw_addr = &cfg->fc_gateway;
1985
		gwa_type = ipv6_addr_type(gw_addr);
1986 1987 1988 1989 1990 1991 1992

		/* if gw_addr is local we will fail to detect this in case
		 * address is still TENTATIVE (DAD in progress). rt6_lookup()
		 * will return already-added prefix route via interface that
		 * prefix route was assigned to, which might be non-loopback.
		 */
		err = -EINVAL;
1993 1994 1995
		if (ipv6_chk_addr_and_flags(net, gw_addr,
					    gwa_type & IPV6_ADDR_LINKLOCAL ?
					    dev : NULL, 0, 0))
1996 1997
			goto out;

A
Alexey Dobriyan 已提交
1998
		rt->rt6i_gateway = *gw_addr;
L
Linus Torvalds 已提交
1999 2000

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2001
			struct rt6_info *grt = NULL;
L
Linus Torvalds 已提交
2002 2003 2004 2005 2006 2007 2008

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
2009 2010
			   We allow IPv4-mapped nexthops to support RFC4798-type
			   addressing
L
Linus Torvalds 已提交
2011
			 */
2012 2013
			if (!(gwa_type & (IPV6_ADDR_UNICAST |
					  IPV6_ADDR_MAPPED)))
L
Linus Torvalds 已提交
2014 2015
				goto out;

2016
			if (cfg->fc_table) {
2017 2018
				grt = ip6_nh_lookup_table(net, cfg, gw_addr);

2019 2020 2021 2022 2023 2024 2025 2026 2027
				if (grt) {
					if (grt->rt6i_flags & RTF_GATEWAY ||
					    (dev && dev != grt->dst.dev)) {
						ip6_rt_put(grt);
						grt = NULL;
					}
				}
			}

2028 2029 2030
			if (!grt)
				grt = rt6_lookup(net, gw_addr, NULL,
						 cfg->fc_ifindex, 1);
L
Linus Torvalds 已提交
2031 2032

			err = -EHOSTUNREACH;
2033
			if (!grt)
L
Linus Torvalds 已提交
2034 2035
				goto out;
			if (dev) {
2036
				if (dev != grt->dst.dev) {
A
Amerigo Wang 已提交
2037
					ip6_rt_put(grt);
L
Linus Torvalds 已提交
2038 2039 2040
					goto out;
				}
			} else {
2041
				dev = grt->dst.dev;
L
Linus Torvalds 已提交
2042 2043 2044 2045
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
2046
			if (!(grt->rt6i_flags & RTF_GATEWAY))
L
Linus Torvalds 已提交
2047
				err = 0;
A
Amerigo Wang 已提交
2048
			ip6_rt_put(grt);
L
Linus Torvalds 已提交
2049 2050 2051 2052 2053

			if (err)
				goto out;
		}
		err = -EINVAL;
2054
		if (!dev || (dev->flags & IFF_LOOPBACK))
L
Linus Torvalds 已提交
2055 2056 2057 2058
			goto out;
	}

	err = -ENODEV;
2059
	if (!dev)
L
Linus Torvalds 已提交
2060 2061
		goto out;

2062 2063 2064 2065 2066
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			err = -EINVAL;
			goto out;
		}
A
Alexey Dobriyan 已提交
2067
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2068 2069 2070 2071
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

2072
	rt->rt6i_flags = cfg->fc_flags;
L
Linus Torvalds 已提交
2073 2074

install_route:
2075
	rt->dst.dev = dev;
L
Linus Torvalds 已提交
2076
	rt->rt6i_idev = idev;
T
Thomas Graf 已提交
2077
	rt->rt6i_table = table;
2078

2079
	cfg->fc_nlinfo.nl_net = dev_net(dev);
2080

2081
	return rt;
2082 2083 2084 2085 2086 2087 2088 2089
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_free(&rt->dst);

2090
	return ERR_PTR(err);
2091 2092 2093 2094 2095
}

int ip6_route_add(struct fib6_config *cfg)
{
	struct mx6_config mxc = { .mx = NULL, };
2096
	struct rt6_info *rt;
2097 2098
	int err;

2099 2100 2101 2102
	rt = ip6_route_info_create(cfg);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
2103
		goto out;
2104
	}
2105

2106 2107 2108
	err = ip6_convert_metrics(&mxc, cfg);
	if (err)
		goto out;
L
Linus Torvalds 已提交
2109

2110 2111 2112
	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);

	kfree(mxc.mx);
2113

2114
	return err;
L
Linus Torvalds 已提交
2115 2116
out:
	if (rt)
2117
		dst_free(&rt->dst);
2118

L
Linus Torvalds 已提交
2119 2120 2121
	return err;
}

2122
static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
L
Linus Torvalds 已提交
2123 2124
{
	int err;
T
Thomas Graf 已提交
2125
	struct fib6_table *table;
2126
	struct net *net = dev_net(rt->dst.dev);
L
Linus Torvalds 已提交
2127

M
Martin KaFai Lau 已提交
2128 2129
	if (rt == net->ipv6.ip6_null_entry ||
	    rt->dst.flags & DST_NOCACHE) {
2130 2131 2132
		err = -ENOENT;
		goto out;
	}
2133

T
Thomas Graf 已提交
2134 2135
	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
2136
	err = fib6_del(rt, info);
T
Thomas Graf 已提交
2137
	write_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
2138

2139
out:
A
Amerigo Wang 已提交
2140
	ip6_rt_put(rt);
L
Linus Torvalds 已提交
2141 2142 2143
	return err;
}

2144 2145
int ip6_del_rt(struct rt6_info *rt)
{
2146
	struct nl_info info = {
2147
		.nl_net = dev_net(rt->dst.dev),
2148
	};
2149
	return __ip6_del_rt(rt, &info);
2150 2151
}

2152 2153 2154
static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
2155
	struct sk_buff *skb = NULL;
2156 2157 2158 2159 2160 2161 2162 2163 2164
	struct fib6_table *table;
	int err;

	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178
		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(info->nl_net, skb, rt,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191
		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out;
		}
	}

	err = fib6_del(rt, info);
out:
	write_unlock_bh(&table->tb6_lock);
	ip6_rt_put(rt);
2192 2193 2194 2195 2196

	if (skb) {
		rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
2197 2198 2199
	return err;
}

2200
static int ip6_route_del(struct fib6_config *cfg)
L
Linus Torvalds 已提交
2201
{
T
Thomas Graf 已提交
2202
	struct fib6_table *table;
L
Linus Torvalds 已提交
2203 2204 2205 2206
	struct fib6_node *fn;
	struct rt6_info *rt;
	int err = -ESRCH;

2207
	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2208
	if (!table)
T
Thomas Graf 已提交
2209 2210 2211
		return err;

	read_lock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
2212

T
Thomas Graf 已提交
2213
	fn = fib6_locate(&table->tb6_root,
2214 2215
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len);
2216

L
Linus Torvalds 已提交
2217
	if (fn) {
2218
		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2219 2220 2221
			if ((rt->rt6i_flags & RTF_CACHE) &&
			    !(cfg->fc_flags & RTF_CACHE))
				continue;
2222
			if (cfg->fc_ifindex &&
2223 2224
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
L
Linus Torvalds 已提交
2225
				continue;
2226 2227
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
L
Linus Torvalds 已提交
2228
				continue;
2229
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
L
Linus Torvalds 已提交
2230
				continue;
2231 2232
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
2233
			dst_hold(&rt->dst);
T
Thomas Graf 已提交
2234
			read_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
2235

2236 2237 2238 2239 2240
			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
L
Linus Torvalds 已提交
2241 2242
		}
	}
T
Thomas Graf 已提交
2243
	read_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
2244 2245 2246 2247

	return err;
}

2248
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2249 2250
{
	struct netevent_redirect netevent;
2251 2252 2253 2254
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
2255
	struct rd_msg *msg;
2256 2257
	int optlen, on_link;
	u8 *lladdr;
2258

2259
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2260
	optlen -= sizeof(*msg);
2261 2262

	if (optlen < 0) {
2263
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2264 2265 2266
		return;
	}

2267
	msg = (struct rd_msg *)icmp6_hdr(skb);
2268

2269
	if (ipv6_addr_is_multicast(&msg->dest)) {
2270
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2271 2272 2273
		return;
	}

2274
	on_link = 0;
2275
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2276
		on_link = 1;
2277
	} else if (ipv6_addr_type(&msg->target) !=
2278
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2279
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

2294
	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2295 2296 2297
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}
2298 2299

	lladdr = NULL;
2300 2301 2302 2303 2304 2305 2306 2307 2308
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

2309
	rt = (struct rt6_info *) dst;
2310
	if (rt->rt6i_flags & RTF_REJECT) {
2311
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2312
		return;
2313
	}
2314

2315 2316 2317 2318 2319
	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->dst);
2320

2321
	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2322 2323
	if (!neigh)
		return;
2324

L
Linus Torvalds 已提交
2325 2326 2327 2328
	/*
	 *	We have finally decided to accept it.
	 */

2329
	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
L
Linus Torvalds 已提交
2330 2331 2332
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2333 2334
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);
L
Linus Torvalds 已提交
2335

M
Martin KaFai Lau 已提交
2336
	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2337
	if (!nrt)
L
Linus Torvalds 已提交
2338 2339 2340 2341 2342 2343
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

A
Alexey Dobriyan 已提交
2344
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
L
Linus Torvalds 已提交
2345

2346
	if (ip6_ins_rt(nrt))
L
Linus Torvalds 已提交
2347 2348
		goto out;

2349 2350
	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
2351
	netevent.daddr = &msg->dest;
2352
	netevent.neigh = neigh;
2353 2354
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

2355
	if (rt->rt6i_flags & RTF_CACHE) {
2356
		rt = (struct rt6_info *) dst_clone(&rt->dst);
2357
		ip6_del_rt(rt);
L
Linus Torvalds 已提交
2358 2359 2360
	}

out:
2361
	neigh_release(neigh);
2362 2363
}

L
Linus Torvalds 已提交
2364 2365 2366 2367
/*
 *	Misc support functions
 */

2368 2369 2370 2371 2372 2373 2374 2375 2376 2377
static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->dst.from);

	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->dst.from = &from->dst;
	dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
}

M
Martin KaFai Lau 已提交
2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391
/*
 * Initialise @rt as a copy of @ort: dst callbacks and error, addressing
 * fields, flags, metric, table and lwtunnel state are copied, references
 * are taken on the inet6_dev (when set) and the lwtunnel state, and @rt
 * is linked back to @ort through rt6_set_from().
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
	struct inet6_dev *idev = ort->rt6i_idev;

	rt->dst.input = ort->dst.input;
	rt->dst.output = ort->dst.output;
	rt->dst.error = ort->dst.error;
	rt->dst.lastuse = jiffies;
	rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);

	rt->rt6i_idev = idev;
	if (idev)
		in6_dev_hold(idev);

	rt->rt6i_dst = ort->rt6i_dst;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_metric = ort->rt6i_metric;
	rt->rt6i_table = ort->rt6i_table;

	/* Copy the flags first: rt6_set_from() clears RTF_EXPIRES on rt. */
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
}

2400
#ifdef CONFIG_IPV6_ROUTE_INFO
2401
static struct rt6_info *rt6_get_route_info(struct net *net,
2402
					   const struct in6_addr *prefix, int prefixlen,
2403 2404
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
2405
{
2406 2407
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
2408 2409
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
T
Thomas Graf 已提交
2410 2411
	struct fib6_table *table;

2412
	table = fib6_get_table(net, tb_id);
2413
	if (!table)
T
Thomas Graf 已提交
2414
		return NULL;
2415

2416
	read_lock_bh(&table->tb6_lock);
2417
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2418 2419 2420
	if (!fn)
		goto out;

2421
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2422
		if (rt->dst.dev->ifindex != ifindex)
2423 2424 2425 2426 2427
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
2428
		dst_hold(&rt->dst);
2429 2430 2431
		break;
	}
out:
2432
	read_unlock_bh(&table->tb6_lock);
2433 2434 2435
	return rt;
}

2436
static struct rt6_info *rt6_add_route_info(struct net *net,
2437
					   const struct in6_addr *prefix, int prefixlen,
2438 2439
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
2440
					   unsigned int pref)
2441
{
2442
	struct fib6_config cfg = {
2443
		.fc_metric	= IP6_RT_PRIO_USER,
2444
		.fc_ifindex	= dev->ifindex,
2445 2446 2447
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
2448
		.fc_nlinfo.portid = 0,
2449 2450
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
2451 2452
	};

2453
	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
A
Alexey Dobriyan 已提交
2454 2455
	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;
2456

2457 2458
	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
2459
		cfg.fc_flags |= RTF_DEFAULT;
2460

2461
	ip6_route_add(&cfg);
2462

2463
	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2464 2465 2466
}
#endif

2467
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2468
{
2469
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
L
Linus Torvalds 已提交
2470
	struct rt6_info *rt;
T
Thomas Graf 已提交
2471
	struct fib6_table *table;
L
Linus Torvalds 已提交
2472

2473
	table = fib6_get_table(dev_net(dev), tb_id);
2474
	if (!table)
T
Thomas Graf 已提交
2475
		return NULL;
L
Linus Torvalds 已提交
2476

2477
	read_lock_bh(&table->tb6_lock);
2478
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2479
		if (dev == rt->dst.dev &&
2480
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
L
Linus Torvalds 已提交
2481 2482 2483 2484
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	if (rt)
2485
		dst_hold(&rt->dst);
2486
	read_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
2487 2488 2489
	return rt;
}

2490
struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2491 2492
				     struct net_device *dev,
				     unsigned int pref)
L
Linus Torvalds 已提交
2493
{
2494
	struct fib6_config cfg = {
D
David Ahern 已提交
2495
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2496
		.fc_metric	= IP6_RT_PRIO_USER,
2497 2498 2499
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2500
		.fc_nlinfo.portid = 0,
2501
		.fc_nlinfo.nlh = NULL,
2502
		.fc_nlinfo.nl_net = dev_net(dev),
2503
	};
L
Linus Torvalds 已提交
2504

A
Alexey Dobriyan 已提交
2505
	cfg.fc_gateway = *gwaddr;
L
Linus Torvalds 已提交
2506

2507 2508 2509 2510 2511 2512 2513
	if (!ip6_route_add(&cfg)) {
		struct fib6_table *table;

		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}
L
Linus Torvalds 已提交
2514 2515 2516 2517

	return rt6_get_dflt_router(gwaddr, dev);
}

2518
static void __rt6_purge_dflt_routers(struct fib6_table *table)
L
Linus Torvalds 已提交
2519 2520 2521 2522
{
	struct rt6_info *rt;

restart:
T
Thomas Graf 已提交
2523
	read_lock_bh(&table->tb6_lock);
2524
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2525 2526
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2527
			dst_hold(&rt->dst);
T
Thomas Graf 已提交
2528
			read_unlock_bh(&table->tb6_lock);
2529
			ip6_del_rt(rt);
L
Linus Torvalds 已提交
2530 2531 2532
			goto restart;
		}
	}
T
Thomas Graf 已提交
2533
	read_unlock_bh(&table->tb6_lock);
2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}

/*
 * Walk every FIB table hashed in @net under rcu_read_lock() and purge
 * the default routers of each table that is flagged
 * RT6_TABLE_HAS_DFLT_ROUTER.
 */
void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		hlist_for_each_entry_rcu(table, &net->ipv6.fib_table_hash[h],
					 tb6_hlist) {
			if (!(table->flags & RT6_TABLE_HAS_DFLT_ROUTER))
				continue;

			__rt6_purge_dflt_routers(table);
		}
	}

	rcu_read_unlock();
}

2557 2558
static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
2559 2560 2561 2562
				 struct fib6_config *cfg)
{
	memset(cfg, 0, sizeof(*cfg));

D
David Ahern 已提交
2563 2564
	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
			 : RT6_TABLE_MAIN;
2565 2566 2567 2568 2569 2570 2571
	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
	cfg->fc_metric = rtmsg->rtmsg_metric;
	cfg->fc_expires = rtmsg->rtmsg_info;
	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
	cfg->fc_src_len = rtmsg->rtmsg_src_len;
	cfg->fc_flags = rtmsg->rtmsg_flags;

2572
	cfg->fc_nlinfo.nl_net = net;
2573

A
Alexey Dobriyan 已提交
2574 2575 2576
	cfg->fc_dst = rtmsg->rtmsg_dst;
	cfg->fc_src = rtmsg->rtmsg_src;
	cfg->fc_gateway = rtmsg->rtmsg_gateway;
2577 2578
}

2579
/*
 * Handle the SIOCADDRT / SIOCDELRT route ioctls.
 *
 * Returns -EINVAL for any other @cmd, -EPERM when the caller lacks
 * CAP_NET_ADMIN in the namespace's user namespace, -EFAULT when the
 * request cannot be copied from user space, and otherwise the result of
 * ip6_route_add() or ip6_route_del(), which run under the RTNL lock.
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	if (cmd != SIOCADDRT && cmd != SIOCDELRT)
		return -EINVAL;

	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (copy_from_user(&rtmsg, arg, sizeof(rtmsg)))
		return -EFAULT;

	rtmsg_to_fib6_config(net, &rtmsg, &cfg);

	rtnl_lock();
	if (cmd == SIOCADDRT)
		err = ip6_route_add(&cfg);	/* Add a route */
	else
		err = ip6_route_del(&cfg);	/* Delete a route */
	rtnl_unlock();

	return err;
}

/*
 *	Drop the packet on the floor
 */

2620
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
L
Linus Torvalds 已提交
2621
{
2622
	int type;
E
Eric Dumazet 已提交
2623
	struct dst_entry *dst = skb_dst(skb);
2624 2625
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
2626
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
U
Ulrich Weber 已提交
2627
		if (type == IPV6_ADDR_ANY) {
2628 2629
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
2630 2631 2632 2633
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
2634 2635
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
2636 2637
		break;
	}
2638
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
L
Linus Torvalds 已提交
2639 2640 2641 2642
	kfree_skb(skb);
	return 0;
}

2643 2644
static int ip6_pkt_discard(struct sk_buff *skb)
{
2645
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2646 2647
}

E
Eric W. Biederman 已提交
2648
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
2649
{
E
Eric Dumazet 已提交
2650
	skb->dev = skb_dst(skb)->dev;
2651
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
L
Linus Torvalds 已提交
2652 2653
}

2654 2655
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
2656
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2657 2658
}

E
Eric W. Biederman 已提交
2659
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2660
{
E
Eric Dumazet 已提交
2661
	skb->dev = skb_dst(skb)->dev;
2662
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2663 2664
}

L
Linus Torvalds 已提交
2665 2666 2667 2668 2669 2670
/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
2671
				    bool anycast)
L
Linus Torvalds 已提交
2672
{
D
David Ahern 已提交
2673
	u32 tb_id;
2674
	struct net *net = dev_net(idev->dev);
2675 2676 2677 2678 2679 2680 2681 2682 2683 2684
	struct net_device *dev = net->loopback_dev;
	struct rt6_info *rt;

	/* use L3 Master device as loopback for host routes if device
	 * is enslaved and address is not link local or multicast
	 */
	if (!rt6_need_strict(addr))
		dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;

	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2685
	if (!rt)
L
Linus Torvalds 已提交
2686 2687 2688 2689
		return ERR_PTR(-ENOMEM);

	in6_dev_hold(idev);

2690
	rt->dst.flags |= DST_HOST;
2691 2692
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
L
Linus Torvalds 已提交
2693 2694
	rt->rt6i_idev = idev;

2695
	rt->rt6i_protocol = RTPROT_KERNEL;
L
Linus Torvalds 已提交
2696
	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2697 2698 2699
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
L
Linus Torvalds 已提交
2700 2701
		rt->rt6i_flags |= RTF_LOCAL;

2702
	rt->rt6i_gateway  = *addr;
A
Alexey Dobriyan 已提交
2703
	rt->rt6i_dst.addr = *addr;
L
Linus Torvalds 已提交
2704
	rt->rt6i_dst.plen = 128;
D
David Ahern 已提交
2705 2706
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);
M
Martin KaFai Lau 已提交
2707
	rt->dst.flags |= DST_NOCACHE;
L
Linus Torvalds 已提交
2708

2709
	atomic_set(&rt->dst.__refcnt, 1);
L
Linus Torvalds 已提交
2710 2711 2712 2713

	return rt;
}

2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726
/*
 * Remove a deleted address from the prefsrc of routes.
 *
 * Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all().
 */
struct arg_dev_net_ip {
	struct net_device *dev;	/* only routes on this device; NULL matches any */
	struct net *net;	/* namespace whose ip6_null_entry is skipped */
	struct in6_addr *addr;	/* address compared against rt6i_prefsrc.addr */
};

static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

2727
	if (((void *)rt->dst.dev == dev || !dev) &&
2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743
	    rt != net->ipv6.ip6_null_entry &&
	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
		/* remove prefsrc entry */
		rt->rt6i_prefsrc.plen = 0;
	}
	return 0;
}

void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
2744
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2745 2746
}

2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
	     ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
	     ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
		return -1;
	}
	return 0;
}

/* Run fib6_clean_tohost() over all routes of @net for the given @gateway. */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}

2768 2769 2770 2771 2772
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device to match; NULL matches every route */
	struct net *net;	/* namespace whose ip6_null_entry is skipped */
};

2773
/* called with write lock held for table with rt */
L
Linus Torvalds 已提交
2774 2775
static int fib6_ifdown(struct rt6_info *rt, void *arg)
{
S
stephen hemminger 已提交
2776 2777
	const struct arg_dev_net *adn = arg;
	const struct net_device *dev = adn->dev;
2778

2779
	if ((rt->dst.dev == dev || !dev) &&
2780 2781 2782
	    rt != adn->net->ipv6.ip6_null_entry &&
	    (rt->rt6i_nsiblings == 0 ||
	     !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
L
Linus Torvalds 已提交
2783
		return -1;
2784

L
Linus Torvalds 已提交
2785 2786 2787
	return 0;
}

2788
void rt6_ifdown(struct net *net, struct net_device *dev)
L
Linus Torvalds 已提交
2789
{
2790 2791 2792 2793 2794
	struct arg_dev_net adn = {
		.dev = dev,
		.net = net,
	};

2795
	fib6_clean_all(net, fib6_ifdown, &adn);
2796
	icmp6_clean_all(fib6_ifdown, &adn);
2797 2798
	if (dev)
		rt6_uncached_list_flush_dev(net, dev);
L
Linus Torvalds 已提交
2799 2800
}

2801
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};

static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
2818
	if (!idev)
L
Linus Torvalds 已提交
2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
A
Alexander Alemayhu 已提交
2833
	   PMTU discovery.
L
Linus Torvalds 已提交
2834
	 */
2835
	if (rt->dst.dev == arg->dev &&
2836
	    dst_metric_raw(&rt->dst, RTAX_MTU) &&
2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850
	    !dst_metric_locked(&rt->dst, RTAX_MTU)) {
		if (rt->rt6i_flags & RTF_CACHE) {
			/* For RTF_CACHE with rt6i_pmtu == 0
			 * (i.e. a redirected route),
			 * the metrics of its rt->dst.from has already
			 * been updated.
			 */
			if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
				rt->rt6i_pmtu = arg->mtu;
		} else if (dst_mtu(&rt->dst) >= arg->mtu ||
			   (dst_mtu(&rt->dst) < arg->mtu &&
			    dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
			dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
		}
2851
	}
L
Linus Torvalds 已提交
2852 2853 2854
	return 0;
}

2855
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
L
Linus Torvalds 已提交
2856
{
T
Thomas Graf 已提交
2857 2858 2859 2860
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};
L
Linus Torvalds 已提交
2861

2862
	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
L
Linus Torvalds 已提交
2863 2864
}

2865
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2866
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2867
	[RTA_OIF]               = { .type = NLA_U32 },
2868
	[RTA_IIF]		= { .type = NLA_U32 },
2869 2870
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
2871
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
2872
	[RTA_PREF]              = { .type = NLA_U8 },
2873 2874
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
2875
	[RTA_EXPIRES]		= { .type = NLA_U32 },
2876
	[RTA_UID]		= { .type = NLA_U32 },
2877 2878 2879 2880
};

static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg)
L
Linus Torvalds 已提交
2881
{
2882 2883
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
2884
	unsigned int pref;
2885
	int err;
L
Linus Torvalds 已提交
2886

2887 2888 2889
	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;
L
Linus Torvalds 已提交
2890

2891 2892 2893 2894 2895 2896 2897 2898 2899
	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
2900
	cfg->fc_type = rtm->rtm_type;
2901

2902 2903
	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
2904 2905
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
2906 2907
		cfg->fc_flags |= RTF_REJECT;

2908 2909 2910
	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

2911 2912 2913
	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

2914
	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2915
	cfg->fc_nlinfo.nlh = nlh;
2916
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2917 2918

	if (tb[RTA_GATEWAY]) {
2919
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2920
		cfg->fc_flags |= RTF_GATEWAY;
L
Linus Torvalds 已提交
2921
	}
2922 2923 2924 2925 2926 2927 2928 2929

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
L
Linus Torvalds 已提交
2930
	}
2931 2932 2933 2934 2935 2936 2937 2938

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
L
Linus Torvalds 已提交
2939
	}
2940

2941
	if (tb[RTA_PREFSRC])
2942
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2943

2944 2945 2946 2947 2948 2949 2950 2951 2952
	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
L
Linus Torvalds 已提交
2953
	}
2954 2955 2956 2957

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

2958 2959 2960
	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2961 2962 2963 2964 2965

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len);
		if (err < 0)
			goto errout;
2966 2967
	}

2968 2969 2970 2971 2972 2973 2974 2975
	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

2976 2977 2978
	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

2979
	if (tb[RTA_ENCAP_TYPE]) {
2980 2981
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

2982 2983 2984 2985 2986
		err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
		if (err < 0)
			goto errout;
	}

2987 2988 2989 2990 2991 2992 2993 2994 2995
	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

2996 2997 2998
	err = 0;
errout:
	return err;
L
Linus Torvalds 已提交
2999 3000
}

3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051
/*
 * One nexthop of a multipath route being assembled; entries are linked
 * into a list by ip6_route_info_append().
 */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route created for this nexthop */
	struct fib6_config r_cfg;	/* copy of the per-nexthop config */
	struct mx6_config mxc;		/* metrics converted from r_cfg */
	struct list_head next;		/* linkage in the nexthop list */
};

/*
 * Emit one warning per nexthop on @rt6_nh_list naming its destination,
 * gateway and interface index, for use when a multipath route replace
 * has failed.
 */
static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
{
	struct rt6_nh *entry;

	list_for_each_entry(entry, rt6_nh_list, next) {
		struct fib6_config *c = &entry->r_cfg;

		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
			&c->fc_dst, &c->fc_gateway, c->fc_ifindex);
	}
}

static int ip6_route_info_append(struct list_head *rt6_nh_list,
				 struct rt6_info *rt, struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	struct rt6_info *rtnh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if rt6_info already exists */
		rtnh = nh->rt6_info;

		if (rtnh->dst.dev == rt->dst.dev &&
		    rtnh->rt6i_idev == rt->rt6i_idev &&
		    ipv6_addr_equal(&rtnh->rt6i_gateway,
				    &rt->rt6i_gateway))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->rt6_info = rt;
	err = ip6_convert_metrics(&nh->mxc, r_cfg);
	if (err) {
		kfree(nh);
		return err;
	}
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}

3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072
/*
 * Send the RTM_NEWROUTE notification for a multipath add/replace.
 *
 * Nothing is sent when there is no route to report.
 */
static void ip6_route_mpath_notify(struct rt6_info *rt,
				   struct rt6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	bool appended = (nlflags & NLM_F_APPEND) &&
			rt_last && rt_last->rt6i_nsiblings;

	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if (appended)
		rt = list_first_entry(&rt_last->rt6i_siblings,
				      struct rt6_info,
				      rt6i_siblings);

	if (!rt)
		return;

	inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}

3073
static int ip6_route_multipath_add(struct fib6_config *cfg)
3074
{
3075 3076
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
3077 3078
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
3079 3080 3081
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
3082
	__u16 nlflags;
3083 3084
	int remaining;
	int attrlen;
3085 3086 3087 3088 3089
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);
3090

3091 3092 3093 3094
	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

3095
	remaining = cfg->fc_mp_len;
3096 3097
	rtnh = (struct rtnexthop *)cfg->fc_mp;

3098 3099 3100
	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
3112
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
3113 3114
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
3115 3116 3117 3118
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
3119
		}
3120

3121 3122 3123 3124
		rt = ip6_route_info_create(&r_cfg);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
3125
			goto cleanup;
3126
		}
3127 3128

		err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3129
		if (err) {
3130 3131 3132 3133 3134 3135 3136
			dst_free(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

3137 3138 3139 3140 3141 3142
	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

3143 3144
	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
3145 3146 3147 3148 3149 3150
		rt_last = nh->rt6_info;
		err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc);
		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->rt6_info;

3151 3152 3153 3154 3155 3156 3157
		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
3158
		}
3159

3160
		/* Because each route is added like a single route we remove
3161 3162 3163 3164 3165
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
3166
		 */
3167 3168
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
3169 3170 3171
		nhn++;
	}

3172 3173
	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3174 3175 3176
	goto cleanup;

add_errout:
3177 3178 3179 3180 3181 3182 3183
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194
	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			dst_free(&nh->rt6_info->dst);
3195
		kfree(nh->mxc.mx);
3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}

/*
 * Delete every nexthop listed in the multipath attribute of @cfg.
 *
 * Each struct rtnexthop is converted into its own single-route config
 * (interface index and gateway overridden where given) and passed to
 * ip6_route_del(). All nexthops are attempted even if one fails.
 *
 * Returns 0 if every deletion succeeded, otherwise the error of the last
 * deletion that failed.
 */
static int ip6_route_multipath_del(struct fib6_config *cfg)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int last_err = 0;

	/* Parse a Multipath Entry */
	for (rtnh = (struct rtnexthop *)cfg->fc_mp, remaining = cfg->fc_mp_len;
	     rtnh_ok(rtnh, remaining);
	     rtnh = rtnh_next(rtnh, &remaining)) {
		int attrlen;
		int err;

		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *attrs = rtnh_attrs(rtnh);
			struct nlattr *gw = nla_find(attrs, attrlen,
						     RTA_GATEWAY);

			if (gw) {
				nla_memcpy(&r_cfg.fc_gateway, gw, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}

		err = ip6_route_del(&r_cfg);
		if (err)
			last_err = err;
	}

	return last_err;
}

3240
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
L
Linus Torvalds 已提交
3241
{
3242 3243
	struct fib6_config cfg;
	int err;
L
Linus Torvalds 已提交
3244

3245 3246 3247 3248
	err = rtm_to_fib6_config(skb, nlh, &cfg);
	if (err < 0)
		return err;

3249
	if (cfg.fc_mp)
3250
		return ip6_route_multipath_del(&cfg);
3251 3252
	else {
		cfg.fc_delete_all_nh = 1;
3253
		return ip6_route_del(&cfg);
3254
	}
L
Linus Torvalds 已提交
3255 3256
}

3257
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
L
Linus Torvalds 已提交
3258
{
3259 3260
	struct fib6_config cfg;
	int err;
L
Linus Torvalds 已提交
3261

3262 3263 3264 3265
	err = rtm_to_fib6_config(skb, nlh, &cfg);
	if (err < 0)
		return err;

3266
	if (cfg.fc_mp)
3267
		return ip6_route_multipath_add(&cfg);
3268 3269
	else
		return ip6_route_add(&cfg);
L
Linus Torvalds 已提交
3270 3271
}

3272
static size_t rt6_nlmsg_size(struct rt6_info *rt)
3273
{
3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285
	int nexthop_len = 0;

	if (rt->rt6i_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + nla_total_size(4)  /* RTA_OIF */
			    + lwtunnel_get_encap_size(rt->dst.lwtstate);

		nexthop_len *= rt->rt6i_nsiblings;
	}

3286 3287 3288 3289 3290 3291 3292 3293 3294
	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
3295
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3296
	       + nla_total_size(sizeof(struct rta_cacheinfo))
3297
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3298
	       + nla_total_size(1) /* RTA_PREF */
3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354
	       + lwtunnel_get_encap_size(rt->dst.lwtstate)
	       + nexthop_len;
}

static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
			    unsigned int *flags)
{
	if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
		*flags |= RTNH_F_LINKDOWN;
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			*flags |= RTNH_F_DEAD;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
			goto nla_put_failure;
	}

	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;

	if (rt->dst.lwtstate &&
	    lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/*
 * Append one struct rtnexthop for @rt, followed by its nexthop attributes,
 * to @skb. The header's length field is filled in last, once the size of
 * the attributes is known.
 *
 * Returns 0 on success or -EMSGSIZE when @skb runs out of room.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
{
	unsigned int nh_flags = 0;
	struct rtnexthop *hdr;

	hdr = nla_reserve_nohdr(skb, sizeof(*hdr));
	if (!hdr)
		return -EMSGSIZE;

	hdr->rtnh_hops = 0;
	if (rt->dst.dev)
		hdr->rtnh_ifindex = rt->dst.dev->ifindex;
	else
		hdr->rtnh_ifindex = 0;

	if (rt6_nexthop_info(skb, rt, &nh_flags) < 0)
		return -EMSGSIZE;

	hdr->rtnh_flags = nh_flags;

	/* length of rtnetlink header + attributes */
	hdr->rtnh_len = nlmsg_get_pos(skb) - (void *)hdr;

	return 0;
}

3357 3358
/*
 * Build one route message of type @type for @rt in @skb.
 *
 * @dst / @src, when non-NULL, are reported as /128 destination / source
 * instead of the route's own prefixes. @iif, when non-zero, is reported as
 * the input interface (or, for multicast destinations with
 * CONFIG_IPV6_MROUTE, the message is completed by ip6mr_get_route()).
 *
 * Returns 0 on success; on failure the partially built message is
 * cancelled and -EMSGSIZE is returned.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	u32 metrics[RTAX_MAX];
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;

	table = rt->rt6i_table ? rt->rt6i_table->tb6_id : RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	/* route type: reject routes are classified by their dst error */
	if (rt->rt6i_flags & RTF_REJECT) {
		if (rt->dst.error == -EINVAL)
			rtm->rtm_type = RTN_BLACKHOLE;
		else if (rt->dst.error == -EACCES)
			rtm->rtm_type = RTN_PROHIBIT;
		else if (rt->dst.error == -EAGAIN)
			rtm->rtm_type = RTN_THROW;
		else
			rtm->rtm_type = RTN_UNREACHABLE;
	} else if ((rt->rt6i_flags & RTF_LOCAL) ||
		   (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))) {
		rtm->rtm_type = RTN_LOCAL;
	} else {
		rtm->rtm_type = RTN_UNICAST;
	}

	rtm->rtm_scope = RT_SCOPE_UNIVERSE;

	/* protocol: dynamic and autoconfigured routes override the stored one */
	if (rt->rt6i_flags & RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (!(rt->rt6i_flags & RTF_ADDRCONF))
		rtm->rtm_protocol = rt->rt6i_protocol;
	else if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
		rtm->rtm_protocol = RTPROT_RA;
	else
		rtm->rtm_protocol = RTPROT_KERNEL;

	rtm->rtm_flags = (rt->rt6i_flags & RTF_CACHE) ? RTM_F_CLONED : 0;

	if (dst) {
		if (nla_put_in6_addr(skb, RTA_DST, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len &&
		   nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr)) {
		goto nla_put_failure;
	}
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr)) {
		goto nla_put_failure;
	}
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			/* zero: return right away; positive: carry on below */
			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
		{
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
		}
	} else if (dst) {
		struct in6_addr saddr_buf;

		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf = rt->rt6i_prefsrc.addr;

		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	/* report a private copy of the metrics so the route's PMTU can be
	 * substituted without touching the dst
	 */
	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt6i_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp = nla_nest_start(skb, RTA_MULTIPATH);

		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags) < 0) {
		goto nla_put_failure;
	}

	if (rt->rt6i_flags & RTF_EXPIRES)
		expires = rt->dst.expires - jiffies;
	else
		expires = 0;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

3517
int rt6_dump_route(struct rt6_info *rt, void *p_arg)
L
Linus Torvalds 已提交
3518 3519
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3520 3521 3522 3523
	struct net *net = arg->net;

	if (rt == net->ipv6.ip6_null_entry)
		return 0;
L
Linus Torvalds 已提交
3524

3525 3526
	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3527 3528 3529 3530 3531 3532 3533 3534

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}
L
Linus Torvalds 已提交
3535

3536
	return rt6_fill_node(net,
3537
		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3538
		     NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3539
		     NLM_F_MULTI);
L
Linus Torvalds 已提交
3540 3541
}

3542
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
L
Linus Torvalds 已提交
3543
{
3544
	struct net *net = sock_net(in_skb->sk);
3545 3546
	struct nlattr *tb[RTA_MAX+1];
	struct rt6_info *rt;
L
Linus Torvalds 已提交
3547
	struct sk_buff *skb;
3548
	struct rtmsg *rtm;
3549
	struct flowi6 fl6;
3550
	int err, iif = 0, oif = 0;
L
Linus Torvalds 已提交
3551

3552 3553 3554
	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;
L
Linus Torvalds 已提交
3555

3556
	err = -EINVAL;
3557
	memset(&fl6, 0, sizeof(fl6));
3558 3559
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
L
Linus Torvalds 已提交
3560

3561 3562 3563 3564
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

A
Alexey Dobriyan 已提交
3565
		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3566 3567 3568 3569 3570 3571
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

A
Alexey Dobriyan 已提交
3572
		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3573 3574 3575 3576 3577 3578
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
3579
		oif = nla_get_u32(tb[RTA_OIF]);
L
Linus Torvalds 已提交
3580

3581 3582 3583
	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

3584 3585 3586 3587 3588 3589
	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

L
Linus Torvalds 已提交
3590 3591
	if (iif) {
		struct net_device *dev;
3592 3593
		int flags = 0;

3594
		dev = __dev_get_by_index(net, iif);
L
Linus Torvalds 已提交
3595 3596
		if (!dev) {
			err = -ENODEV;
3597
			goto errout;
L
Linus Torvalds 已提交
3598
		}
3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
							       flags);
	} else {
		fl6.flowi6_oif = oif;

		rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
L
Linus Torvalds 已提交
3611 3612
	}

3613
	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3614
	if (!skb) {
A
Amerigo Wang 已提交
3615
		ip6_rt_put(rt);
3616 3617 3618
		err = -ENOBUFS;
		goto errout;
	}
L
Linus Torvalds 已提交
3619

3620
	skb_dst_set(skb, &rt->dst);
L
Linus Torvalds 已提交
3621

3622
	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3623
			    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3624
			    nlh->nlmsg_seq, 0);
L
Linus Torvalds 已提交
3625
	if (err < 0) {
3626 3627
		kfree_skb(skb);
		goto errout;
L
Linus Torvalds 已提交
3628 3629
	}

3630
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3631
errout:
L
Linus Torvalds 已提交
3632 3633 3634
	return err;
}

3635 3636
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
L
Linus Torvalds 已提交
3637 3638
{
	struct sk_buff *skb;
3639
	struct net *net = info->nl_net;
3640 3641 3642 3643
	u32 seq;
	int err;

	err = -ENOBUFS;
3644
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3645

3646
	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3647
	if (!skb)
3648 3649
		goto errout;

3650
	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3651
				event, info->portid, seq, nlm_flags);
3652 3653 3654 3655 3656 3657
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
3658
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3659 3660
		    info->nlh, gfp_any());
	return;
3661 3662
errout:
	if (err < 0)
3663
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
L
Linus Torvalds 已提交
3664 3665
}

3666
static int ip6_route_dev_notify(struct notifier_block *this,
3667
				unsigned long event, void *ptr)
3668
{
3669
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3670
	struct net *net = dev_net(dev);
3671 3672

	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
3673
		net->ipv6.ip6_null_entry->dst.dev = dev;
3674 3675
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
3676
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3677
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3678
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3679 3680 3681 3682 3683 3684 3685
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	}

	return NOTIFY_OK;
}

L
Linus Torvalds 已提交
3686 3687 3688 3689 3690 3691
/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS

3692 3693 3694 3695 3696
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
3697
	.release	= seq_release_net,
3698 3699
};

L
Linus Torvalds 已提交
3700 3701
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
3702
	struct net *net = (struct net *)seq->private;
L
Linus Torvalds 已提交
3703
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3704 3705 3706 3707 3708
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   net->ipv6.rt6_stats->fib_rt_alloc,
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
3709
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3710
		   net->ipv6.rt6_stats->fib_discarded_routes);
L
Linus Torvalds 已提交
3711 3712 3713 3714 3715 3716

	return 0;
}

static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
3717
	return single_open_net(inode, file, rt6_stats_seq_show);
3718 3719
}

3720
static const struct file_operations rt6_stats_seq_fops = {
L
Linus Torvalds 已提交
3721 3722 3723 3724
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
3725
	.release = single_release_net,
L
Linus Torvalds 已提交
3726 3727 3728 3729 3730 3731
};
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

static
3732
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
L
Linus Torvalds 已提交
3733 3734
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
3735 3736 3737
	struct net *net;
	int delay;
	if (!write)
L
Linus Torvalds 已提交
3738
		return -EINVAL;
3739 3740 3741 3742

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
3743
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3744
	return 0;
L
Linus Torvalds 已提交
3745 3746
}

3747
struct ctl_table ipv6_route_table_template[] = {
3748
	{
L
Linus Torvalds 已提交
3749
		.procname	=	"flush",
3750
		.data		=	&init_net.ipv6.sysctl.flush_delay,
L
Linus Torvalds 已提交
3751
		.maxlen		=	sizeof(int),
3752
		.mode		=	0200,
A
Alexey Dobriyan 已提交
3753
		.proc_handler	=	ipv6_sysctl_rtcache_flush
L
Linus Torvalds 已提交
3754 3755 3756
	},
	{
		.procname	=	"gc_thresh",
3757
		.data		=	&ip6_dst_ops_template.gc_thresh,
L
Linus Torvalds 已提交
3758 3759
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
3760
		.proc_handler	=	proc_dointvec,
L
Linus Torvalds 已提交
3761 3762 3763
	},
	{
		.procname	=	"max_size",
3764
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
L
Linus Torvalds 已提交
3765 3766
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
3767
		.proc_handler	=	proc_dointvec,
L
Linus Torvalds 已提交
3768 3769 3770
	},
	{
		.procname	=	"gc_min_interval",
3771
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
L
Linus Torvalds 已提交
3772 3773
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
3774
		.proc_handler	=	proc_dointvec_jiffies,
L
Linus Torvalds 已提交
3775 3776 3777
	},
	{
		.procname	=	"gc_timeout",
3778
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
L
Linus Torvalds 已提交
3779 3780
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
3781
		.proc_handler	=	proc_dointvec_jiffies,
L
Linus Torvalds 已提交
3782 3783 3784
	},
	{
		.procname	=	"gc_interval",
3785
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
L
Linus Torvalds 已提交
3786 3787
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
3788
		.proc_handler	=	proc_dointvec_jiffies,
L
Linus Torvalds 已提交
3789 3790 3791
	},
	{
		.procname	=	"gc_elasticity",
3792
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
L
Linus Torvalds 已提交
3793 3794
		.maxlen		=	sizeof(int),
		.mode		=	0644,
3795
		.proc_handler	=	proc_dointvec,
L
Linus Torvalds 已提交
3796 3797 3798
	},
	{
		.procname	=	"mtu_expires",
3799
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
L
Linus Torvalds 已提交
3800 3801
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
3802
		.proc_handler	=	proc_dointvec_jiffies,
L
Linus Torvalds 已提交
3803 3804 3805
	},
	{
		.procname	=	"min_adv_mss",
3806
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
L
Linus Torvalds 已提交
3807 3808
		.maxlen		=	sizeof(int),
		.mode		=	0644,
3809
		.proc_handler	=	proc_dointvec,
L
Linus Torvalds 已提交
3810 3811 3812
	},
	{
		.procname	=	"gc_min_interval_ms",
3813
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
L
Linus Torvalds 已提交
3814 3815
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
3816
		.proc_handler	=	proc_dointvec_ms_jiffies,
L
Linus Torvalds 已提交
3817
	},
3818
	{ }
L
Linus Torvalds 已提交
3819 3820
};

3821
/*
 * Create the route sysctl table for @net: a copy of
 * ipv6_route_table_template whose entries point at @net's own variables.
 *
 * Returns the new table, or NULL if it cannot be allocated.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);
	if (!table)
		return NULL;

	/* indices follow the order of ipv6_route_table_template */
	table[0].data = &net->ipv6.sysctl.flush_delay;
	table[0].extra1 = net;	/* lets the flush handler find its namespace */
	table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
	table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
	table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
	table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
	table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
	table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
	table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
	table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
	table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

	/* Don't export sysctls to unprivileged users */
	if (net->user_ns != &init_user_ns)
		table[0].procname = NULL;

	return table;
}
L
Linus Torvalds 已提交
3849 3850
#endif

3851
/*
 * Per-namespace init: copy the dst ops template, allocate the namespace's
 * special route entries from their templates and set the routing sysctl
 * defaults.
 *
 * Returns 0 on success or -ENOMEM, undoing whatever had been set up.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	struct rt6_info *entry;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		return -ENOMEM;

	entry = kmemdup(&ip6_null_entry_template, sizeof(*entry), GFP_KERNEL);
	net->ipv6.ip6_null_entry = entry;
	if (!entry)
		goto err_dst_entries;
	/* the copy must refer to itself and to this namespace's ops */
	entry->dst.path = (struct dst_entry *)entry;
	entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&entry->dst, ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	entry = kmemdup(&ip6_prohibit_entry_template, sizeof(*entry),
			GFP_KERNEL);
	net->ipv6.ip6_prohibit_entry = entry;
	if (!entry)
		goto err_null_entry;
	entry->dst.path = (struct dst_entry *)entry;
	entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&entry->dst, ip6_template_metrics, true);

	entry = kmemdup(&ip6_blk_hole_entry_template, sizeof(*entry),
			GFP_KERNEL);
	net->ipv6.ip6_blk_hole_entry = entry;
	if (!entry)
		goto err_prohibit_entry;
	entry->dst.path = (struct dst_entry *)entry;
	entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&entry->dst, ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	return 0;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
err_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
err_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
err_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
	return -ENOMEM;
}

3923
/*
 * Per-namespace exit: free the special route entries allocated by
 * ip6_route_net_init() and tear down the dst entry counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	struct netns_ipv6 *ns6 = &net->ipv6;

	kfree(ns6->ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(ns6->ip6_prohibit_entry);
	kfree(ns6->ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&ns6->ip6_dst_ops);
}

3933 3934 3935
/*
 * Late per-namespace init: create the "ipv6_route" and "rt6_stats" proc
 * entries when procfs is configured. The results of proc_create() are not
 * checked; the function always returns 0.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	struct proc_dir_entry *dir = net->proc_net;

	proc_create("ipv6_route", 0, dir, &ipv6_route_proc_fops);
	proc_create("rt6_stats", S_IRUGO, dir, &rt6_stats_seq_fops);
#endif
	return 0;
}

/* Late per-namespace exit: remove the proc entries created at late init. */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	struct proc_dir_entry *dir = net->proc_net;

	remove_proc_entry("ipv6_route", dir);
	remove_proc_entry("rt6_stats", dir);
#endif
}

3950 3951 3952 3953 3954
/* Per-namespace setup/teardown of the route core (special entries, sysctls). */
static struct pernet_operations ip6_route_net_ops = {
	.init	=	ip6_route_net_init,
	.exit	=	ip6_route_net_exit,
};

3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970
/*
 * Per-namespace init: allocate and initialise the IPv6 inet_peer base and
 * publish it in net->ipv6.peers. Returns 0 or -ENOMEM.
 */
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *base;

	base = kmalloc(sizeof(*base), GFP_KERNEL);
	if (!base)
		return -ENOMEM;

	inet_peer_base_init(base);
	net->ipv6.peers = base;

	return 0;
}

/*
 * Per-namespace exit: unpublish the IPv6 inet_peer base first, then
 * invalidate its tree and free it.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *base;

	base = net->ipv6.peers;
	net->ipv6.peers = NULL;

	inetpeer_invalidate_tree(base);
	kfree(base);
}

3975
static struct pernet_operations ipv6_inetpeer_ops = {
3976 3977 3978 3979
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

3980 3981 3982 3983 3984
/* Per-namespace setup/teardown of the route proc entries. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init	=	ip6_route_net_init_late,
	.exit	=	ip6_route_net_exit_late,
};

3985 3986 3987 3988 3989
/* Netdevice notifier hooking ip6_route_dev_notify() at default priority. */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call	= ip6_route_dev_notify,
	.priority	= 0,
};

3990
/*
 * Boot-time initialisation of IPv6 routing: dst cache, per-namespace
 * subsystems, FIB, xfrm, policy rules, rtnetlink handlers, the netdevice
 * notifier and the per-cpu uncached route lists.
 *
 * Returns 0 on success or a negative errno; on failure the steps that had
 * already succeeded are undone in reverse order by the unwind labels at the
 * end.
 */
int __init ip6_route_init(void)
{
	struct net_device *lo;
	int ret;
	int cpu;

	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		return -ENOMEM;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto undo_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto undo_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto undo_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	lo = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = lo;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(lo);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = lo;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(lo);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = lo;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(lo);
#endif
	ret = fib6_init();
	if (ret)
		goto undo_route_net;

	ret = xfrm6_init();
	if (ret)
		goto undo_fib6;

	ret = fib6_rules_init();
	if (ret)
		goto undo_xfrm6;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto undo_fib6_rules;

	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
		goto undo_route_net_late;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto undo_route_net_late;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

	return 0;

undo_route_net_late:
	unregister_pernet_subsys(&ip6_route_net_late_ops);
undo_fib6_rules:
	fib6_rules_cleanup();
undo_xfrm6:
	xfrm6_fini();
undo_fib6:
	fib6_gc_cleanup();
undo_route_net:
	unregister_pernet_subsys(&ip6_route_net_ops);
undo_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
undo_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
undo_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	return ret;
}

void ip6_route_cleanup(void)
{
4084
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
4085
	unregister_pernet_subsys(&ip6_route_net_late_ops);
T
Thomas Graf 已提交
4086
	fib6_rules_cleanup();
L
Linus Torvalds 已提交
4087 4088
	xfrm6_fini();
	fib6_gc_cleanup();
4089
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
4090
	unregister_pernet_subsys(&ip6_route_net_ops);
4091
	dst_entries_destroy(&ip6_dst_blackhole_ops);
4092
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
L
Linus Torvalds 已提交
4093
}