route.c 72.9 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

27 28
#define pr_fmt(fmt) "IPv6: " fmt

29
#include <linux/capability.h>
L
Linus Torvalds 已提交
30
#include <linux/errno.h>
31
#include <linux/export.h>
L
Linus Torvalds 已提交
32 33 34 35 36 37 38 39
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
40
#include <linux/mroute6.h>
L
Linus Torvalds 已提交
41 42 43 44
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
45
#include <linux/nsproxy.h>
46
#include <linux/slab.h>
47
#include <net/net_namespace.h>
L
Linus Torvalds 已提交
48 49 50 51 52 53 54 55 56 57
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
58
#include <net/netevent.h>
59
#include <net/netlink.h>
L
Linus Torvalds 已提交
60 61 62 63 64 65 66

#include <asm/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

67
static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
E
Eric Dumazet 已提交
68
				    const struct in6_addr *dest);
L
Linus Torvalds 已提交
69
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
70
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
71
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
L
Linus Torvalds 已提交
72 73 74 75
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
76
static int		 ip6_dst_gc(struct dst_ops *ops);
L
Linus Torvalds 已提交
77 78 79 80

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
81 82 83 84
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
L
Linus Torvalds 已提交
85

86
#ifdef CONFIG_IPV6_ROUTE_INFO
87
static struct rt6_info *rt6_add_route_info(struct net *net,
88 89
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex,
90
					   unsigned int pref);
91
static struct rt6_info *rt6_get_route_info(struct net *net,
92 93
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex);
94 95
#endif

96 97 98 99 100 101
static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

102 103 104
	if (!(rt->dst.flags & DST_HOST))
		return NULL;

105
	peer = rt6_get_peer_create(rt);
106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		}
	}
	return p;
}

126 127 128
static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
129 130 131
{
	struct in6_addr *p = &rt->rt6i_gateway;

D
David S. Miller 已提交
132
	if (!ipv6_addr_any(p))
133
		return (const void *) p;
134 135
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
136 137 138
	return daddr;
}

139 140 141
static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
142
{
143 144 145
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

146
	daddr = choose_neigh_daddr(rt, skb, daddr);
147
	n = __ipv6_neigh_lookup(&nd_tbl, dst->dev, daddr);
148 149 150 151 152
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}

153
static int rt6_bind_neighbour(struct rt6_info *rt, struct net_device *dev)
154
{
155 156 157 158 159 160
	struct neighbour *n = __ipv6_neigh_lookup(&nd_tbl, dev, &rt->rt6i_gateway);
	if (!n) {
		n = neigh_create(&nd_tbl, &rt->rt6i_gateway, dev);
		if (IS_ERR(n))
			return PTR_ERR(n);
	}
161
	rt->n = n;
162 163

	return 0;
164 165
}

166
static struct dst_ops ip6_dst_ops_template = {
L
Linus Torvalds 已提交
167
	.family			=	AF_INET6,
168
	.protocol		=	cpu_to_be16(ETH_P_IPV6),
L
Linus Torvalds 已提交
169 170 171
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
172
	.default_advmss		=	ip6_default_advmss,
173
	.mtu			=	ip6_mtu,
174
	.cow_metrics		=	ipv6_cow_metrics,
L
Linus Torvalds 已提交
175 176 177 178 179
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
180
	.redirect		=	rt6_do_redirect,
181
	.local_out		=	__ip6_local_out,
182
	.neigh_lookup		=	ip6_neigh_lookup,
L
Linus Torvalds 已提交
183 184
};

185
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
186
{
187 188 189
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
190 191
}

192 193
/* dst_ops->update_pmtu hook for blackhole routes: PMTU updates are ignored. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
	/* deliberately empty */
}

197 198
/* dst_ops->redirect hook for blackhole routes: redirects are ignored. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
	/* deliberately empty */
}

202 203 204 205 206 207
/*
 * dst_ops->cow_metrics hook for blackhole routes: writable metrics are
 * never provided, so this always returns NULL.
 */
static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
					 unsigned long old)
{
	return NULL;
}

208 209
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
210
	.protocol		=	cpu_to_be16(ETH_P_IPV6),
211 212
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
213
	.mtu			=	ip6_blackhole_mtu,
214
	.default_advmss		=	ip6_default_advmss,
215
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
216
	.redirect		=	ip6_rt_blackhole_redirect,
217
	.cow_metrics		=	ip6_rt_blackhole_cow_metrics,
218
	.neigh_lookup		=	ip6_neigh_lookup,
219 220
};

221 222 223 224
/*
 * Read-only metrics array with only the hop-limit slot set (255); every
 * other metric is zero.  Metric slots are indexed by RTAX_* - 1.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1]	= 255,
};

225
static struct rt6_info ip6_null_entry_template = {
226 227 228 229 230 231 232
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
L
Linus Torvalds 已提交
233 234
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
235
	.rt6i_protocol  = RTPROT_KERNEL,
L
Linus Torvalds 已提交
236 237 238 239
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

T
Thomas Graf 已提交
240 241
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

242 243 244
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

245
static struct rt6_info ip6_prohibit_entry_template = {
246 247 248 249 250 251 252
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
T
Thomas Graf 已提交
253 254
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
255
	.rt6i_protocol  = RTPROT_KERNEL,
T
Thomas Graf 已提交
256 257 258 259
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

260
static struct rt6_info ip6_blk_hole_entry_template = {
261 262 263 264 265 266 267
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard,
T
Thomas Graf 已提交
268 269
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
270
	.rt6i_protocol  = RTPROT_KERNEL,
T
Thomas Graf 已提交
271 272 273 274 275 276
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif

L
Linus Torvalds 已提交
277
/* allocate dst with ip6_dst_ops */
278
static inline struct rt6_info *ip6_dst_alloc(struct net *net,
279
					     struct net_device *dev,
280 281
					     int flags,
					     struct fib6_table *table)
L
Linus Torvalds 已提交
282
{
283
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
284
					0, DST_OBSOLETE_NONE, flags);
285

286
	if (rt) {
287 288 289
		struct dst_entry *dst = &rt->dst;

		memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
290
		rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
291
	}
292
	return rt;
L
Linus Torvalds 已提交
293 294 295 296 297 298 299
}

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;

300 301 302
	if (rt->n)
		neigh_release(rt->n);

303 304 305
	if (!(rt->dst.flags & DST_HOST))
		dst_destroy_metrics_generic(dst);

306
	if (idev) {
L
Linus Torvalds 已提交
307 308
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
309
	}
310 311 312 313

	if (!(rt->rt6i_flags & RTF_EXPIRES) && dst->from)
		dst_release(dst->from);

314 315
	if (rt6_has_peer(rt)) {
		struct inet_peer *peer = rt6_peer_ptr(rt);
316 317 318 319
		inet_putpeer(peer);
	}
}

320 321 322 323 324 325 326
/* Generation counter that routes compare against rt6i_peer_genid. */
static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);

/* Return the current value of the peer generation counter. */
static u32 rt6_peer_genid(void)
{
	u32 genid = atomic_read(&__rt6_peer_genid);

	return genid;
}

327 328
/*
 * Look up (and, when @create is non-zero, create) the inet_peer for the
 * route's destination address and attach it to @rt.
 *
 * Nothing happens when the route has no peer base.  If rt6_set_peer()
 * rejects the peer, the reference just obtained is dropped again;
 * otherwise the route's peer generation id is refreshed.
 */
void rt6_bind_peer(struct rt6_info *rt, int create)
{
	struct inet_peer_base *base = inetpeer_base_ptr(rt->_rt6i_peer);
	struct inet_peer *peer;

	if (!base)
		return;

	peer = inet_getpeer_v6(base, &rt->rt6i_dst.addr, create);
	if (!peer)
		return;

	if (rt6_set_peer(rt, peer))
		rt->rt6i_peer_genid = rt6_peer_genid();
	else
		inet_putpeer(peer);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
350
	struct net_device *loopback_dev =
351
		dev_net(dev)->loopback_dev;
L
Linus Torvalds 已提交
352

353 354 355 356 357 358 359 360 361 362 363 364 365
	if (dev != loopback_dev) {
		if (idev && idev->dev == dev) {
			struct inet6_dev *loopback_idev =
				in6_dev_get(loopback_dev);
			if (loopback_idev) {
				rt->rt6i_idev = loopback_idev;
				in6_dev_put(idev);
			}
		}
		if (rt->n && rt->n->dev == dev) {
			rt->n->dev = loopback_dev;
			dev_hold(loopback_dev);
			dev_put(dev);
L
Linus Torvalds 已提交
366 367 368 369
		}
	}
}

370
static bool rt6_check_expired(const struct rt6_info *rt)
L
Linus Torvalds 已提交
371
{
372 373 374 375
	struct rt6_info *ort = NULL;

	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
376
			return true;
377 378 379 380 381
	} else if (rt->dst.from) {
		ort = (struct rt6_info *) rt->dst.from;
		return (ort->rt6i_flags & RTF_EXPIRES) &&
			time_after(jiffies, ort->dst.expires);
	}
382
	return false;
L
Linus Torvalds 已提交
383 384
}

385
static bool rt6_need_strict(const struct in6_addr *daddr)
T
Thomas Graf 已提交
386
{
E
Eric Dumazet 已提交
387 388
	return ipv6_addr_type(daddr) &
		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
T
Thomas Graf 已提交
389 390
}

L
Linus Torvalds 已提交
391
/*
T
Thomas Graf 已提交
392
 *	Route lookup. Any table->tb6_lock is implied.
L
Linus Torvalds 已提交
393 394
 */

395 396
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
397
						    const struct in6_addr *saddr,
L
Linus Torvalds 已提交
398
						    int oif,
399
						    int flags)
L
Linus Torvalds 已提交
400 401 402 403
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

404 405 406
	if (!oif && ipv6_addr_any(saddr))
		goto out;

407
	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
408
		struct net_device *dev = sprt->dst.dev;
409 410

		if (oif) {
L
Linus Torvalds 已提交
411 412 413
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
414
				if (!sprt->rt6i_idev ||
L
Linus Torvalds 已提交
415
				    sprt->rt6i_idev->dev->ifindex != oif) {
416
					if (flags & RT6_LOOKUP_F_IFACE && oif)
L
Linus Torvalds 已提交
417
						continue;
418
					if (local && (!oif ||
L
Linus Torvalds 已提交
419 420 421 422 423
						      local->rt6i_idev->dev->ifindex == oif))
						continue;
				}
				local = sprt;
			}
424 425 426 427
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
L
Linus Torvalds 已提交
428
		}
429
	}
L
Linus Torvalds 已提交
430

431
	if (oif) {
L
Linus Torvalds 已提交
432 433 434
		if (local)
			return local;

435
		if (flags & RT6_LOOKUP_F_IFACE)
436
			return net->ipv6.ip6_null_entry;
L
Linus Torvalds 已提交
437
	}
438
out:
L
Linus Torvalds 已提交
439 440 441
	return rt;
}

442 443 444
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a Neighbour Solicitation for the
 * neighbour bound to @rt when that neighbour is not in a NUD_VALID state.
 *
 * Probes are rate-limited: one is only sent once the neighbour's
 * 'updated' stamp is older than the interface's rtr_probe_interval, and
 * the stamp is refreshed before sending.  Because the stamp is modified
 * here, neigh->lock is taken for writing; holding only the reader side
 * would let concurrent callers all pass the interval check and race on
 * the store.
 *
 * A NULL @rt, or a route without a bound neighbour, is a no-op.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh;

	rcu_read_lock();
	neigh = rt ? rt->n : NULL;
	/* Cheap unlocked pre-check; repeated below under the lock. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		goto out;

	write_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		write_unlock_bh(&neigh->lock);

		/* Solicit the neighbour via its solicited-node multicast address. */
		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->dst.dev, NULL, target, &mcaddr, NULL);
	} else {
		write_unlock_bh(&neigh->lock);
	}
out:
	rcu_read_unlock();
}
#else
/* Router preference support compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

L
Linus Torvalds 已提交
482
/*
483
 * Default Router Selection (RFC 2461 6.3.6)
L
Linus Torvalds 已提交
484
 */
D
Dave Jones 已提交
485
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
486
{
487
	struct net_device *dev = rt->dst.dev;
488
	if (!oif || dev->ifindex == oif)
489
		return 2;
490 491 492 493
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
494
}
L
Linus Torvalds 已提交
495

D
Dave Jones 已提交
496
static inline int rt6_check_neigh(struct rt6_info *rt)
L
Linus Torvalds 已提交
497
{
498
	struct neighbour *neigh;
499
	int m;
500 501

	rcu_read_lock();
502
	neigh = rt->n;
503 504 505 506
	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		m = 1;
	else if (neigh) {
507 508
		read_lock_bh(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
509
			m = 2;
510 511 512 513 514
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (neigh->nud_state & NUD_FAILED)
			m = 0;
#endif
		else
515
			m = 1;
516
		read_unlock_bh(&neigh->lock);
517 518
	} else
		m = 0;
519
	rcu_read_unlock();
520
	return m;
L
Linus Torvalds 已提交
521 522
}

523 524
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
L
Linus Torvalds 已提交
525
{
526
	int m, n;
527

528
	m = rt6_check_dev(rt, oif);
529
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
530
		return -1;
531 532 533
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
534
	n = rt6_check_neigh(rt);
535
	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
536 537 538 539
		return -1;
	return m;
}

540 541
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match)
542
{
543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569
	int m;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m < 0)
		goto out;

	if (m > *mpri) {
		if (strict & RT6_LOOKUP_F_REACHABLE)
			rt6_probe(match);
		*mpri = m;
		match = rt;
	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
		rt6_probe(rt);
	}

out:
	return match;
}

/*
 * Find the best route among the entries of @fn that share @metric.
 *
 * The scan starts at the round-robin head @rr_head and runs to the end
 * of the equal-metric run, then wraps around to fn->leaf and continues
 * until it reaches @rr_head again.  Returns the best match or NULL.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict)
{
	struct rt6_info *match = NULL;
	struct rt6_info *rt;
	int mpri = -1;

	rt = rr_head;
	while (rt && rt->rt6i_metric == metric) {
		match = find_match(rt, oif, strict, &mpri, match);
		rt = rt->dst.rt6_next;
	}

	rt = fn->leaf;
	while (rt && rt != rr_head && rt->rt6i_metric == metric) {
		match = find_match(rt, oif, strict, &mpri, match);
		rt = rt->dst.rt6_next;
	}

	return match;
}
L
Linus Torvalds 已提交
582

583 584 585
static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
586
	struct net *net;
L
Linus Torvalds 已提交
587

588 589 590
	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;
L
Linus Torvalds 已提交
591

592
	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
L
Linus Torvalds 已提交
593

594
	if (!match &&
595
	    (strict & RT6_LOOKUP_F_REACHABLE)) {
596
		struct rt6_info *next = rt0->dst.rt6_next;
597

598
		/* no entries matched; do round-robin */
599 600 601 602 603
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
L
Linus Torvalds 已提交
604 605
	}

606
	net = dev_net(rt0->dst.dev);
E
Eric Dumazet 已提交
607
	return match ? match : net->ipv6.ip6_null_entry;
L
Linus Torvalds 已提交
608 609
}

610 611
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a route_info option (@opt, @len bytes) received on @dev from
 * the router @gwaddr.
 * NOTE(review): presumably the RFC 4191 Route Information option of a
 * Router Advertisement -- confirm against the caller.
 *
 * The option is validated (option length vs. prefix length, preference
 * value), then the matching route-info route is deleted when the lifetime
 * is zero, created when it is missing, or has its preference bits
 * refreshed, and finally its expiry is set from the lifetime.
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct net *net = dev_net(dev);
	struct in6_addr prefix_buf, *prefix;
	unsigned long lifetime;
	unsigned int pref;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		/* The option carries the full 128-bit prefix. */
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (rt) {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
	} else if (lifetime) {
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev->ifindex, pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime))
		rt6_set_expires(rt, jiffies + HZ * lifetime);
	else
		rt6_clean_expires(rt);

	dst_release(&rt->dst);
	return 0;
}
#endif

683
/*
 * BACKTRACK - climb the fib6 tree when the current node yielded no route.
 *
 * Must be expanded inside a function that has local variables 'rt' and
 * 'fn' and labels 'restart' and 'out'.  When 'rt' is the namespace's null
 * entry, the macro walks from 'fn' towards the root: at each step it
 * moves to the parent, or, if the parent has a subtree other than the
 * one we came from, looks @saddr up in that subtree.  It jumps to
 * 'restart' at the first node flagged RTN_RTINFO and to 'out' once the
 * current node is the top-level root (RTN_TL_ROOT).
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == (__net)->ipv6.ip6_null_entry) {			\
		struct fib6_node *pn;					\
		for (;;) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			pn = fn->parent;				\
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)	\
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, (saddr)); \
			else						\
				fn = pn;				\
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while (0)
T
Thomas Graf 已提交
700

701 702
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
703
					     struct flowi6 *fl6, int flags)
L
Linus Torvalds 已提交
704 705 706 707
{
	struct fib6_node *fn;
	struct rt6_info *rt;

T
Thomas Graf 已提交
708
	read_lock_bh(&table->tb6_lock);
709
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
T
Thomas Graf 已提交
710 711
restart:
	rt = fn->leaf;
712 713
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	BACKTRACK(net, &fl6->saddr);
T
Thomas Graf 已提交
714
out:
715
	dst_use(&rt->dst, jiffies);
T
Thomas Graf 已提交
716 717 718 719 720
	read_unlock_bh(&table->tb6_lock);
	return rt;

}

F
Florian Westphal 已提交
721 722 723 724 725 726 727
/*
 * Look up the route for flow @fl6 in namespace @net by running
 * ip6_pol_route_lookup() through fib6_rule_lookup().
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   int flags)
{
	struct dst_entry *dst;

	dst = fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
	return dst;
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

728 729
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
T
Thomas Graf 已提交
730
{
731 732 733
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
T
Thomas Graf 已提交
734 735
	};
	struct dst_entry *dst;
736
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
T
Thomas Graf 已提交
737

738
	if (saddr) {
739
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
740 741 742
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

743
	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
T
Thomas Graf 已提交
744 745 746 747 748
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

L
Linus Torvalds 已提交
749 750 751
	return NULL;
}

752 753
EXPORT_SYMBOL(rt6_lookup);

T
Thomas Graf 已提交
754
/* ip6_ins_rt is called with FREE table->tb6_lock.
L
Linus Torvalds 已提交
755 756 757 758 759
   It takes new route entry, the addition fails by any reason the
   route is freed. In any case, if caller does not hold it, it may
   be destroyed.
 */

760
static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
L
Linus Torvalds 已提交
761 762
{
	int err;
T
Thomas Graf 已提交
763
	struct fib6_table *table;
L
Linus Torvalds 已提交
764

T
Thomas Graf 已提交
765 766
	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
767
	err = fib6_add(&table->tb6_root, rt, info);
T
Thomas Graf 已提交
768
	write_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
769 770 771 772

	return err;
}

773 774
int ip6_ins_rt(struct rt6_info *rt)
{
775
	struct nl_info info = {
776
		.nl_net = dev_net(rt->dst.dev),
777
	};
778
	return __ip6_ins_rt(rt, &info);
779 780
}

781
static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort,
E
Eric Dumazet 已提交
782
				      const struct in6_addr *daddr,
783
				      const struct in6_addr *saddr)
L
Linus Torvalds 已提交
784 785 786 787 788 789 790
{
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

E
Eric Dumazet 已提交
791
	rt = ip6_rt_copy(ort, daddr);
L
Linus Torvalds 已提交
792 793

	if (rt) {
794 795
		int attempts = !in_softirq();

796
		if (!(rt->rt6i_flags & RTF_GATEWAY)) {
797
			if (ort->rt6i_dst.plen != 128 &&
E
Eric Dumazet 已提交
798
			    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
799
				rt->rt6i_flags |= RTF_ANYCAST;
A
Alexey Dobriyan 已提交
800
			rt->rt6i_gateway = *daddr;
801
		}
L
Linus Torvalds 已提交
802 803 804 805 806

		rt->rt6i_flags |= RTF_CACHE;

#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
A
Alexey Dobriyan 已提交
807
			rt->rt6i_src.addr = *saddr;
L
Linus Torvalds 已提交
808 809 810 811
			rt->rt6i_src.plen = 128;
		}
#endif

812
	retry:
813
		if (rt6_bind_neighbour(rt, rt->dst.dev)) {
814
			struct net *net = dev_net(rt->dst.dev);
815 816 817 818 819 820 821 822 823
			int saved_rt_min_interval =
				net->ipv6.sysctl.ip6_rt_gc_min_interval;
			int saved_rt_elasticity =
				net->ipv6.sysctl.ip6_rt_gc_elasticity;

			if (attempts-- > 0) {
				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;

824
				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
825 826 827 828 829 830 831 832

				net->ipv6.sysctl.ip6_rt_gc_elasticity =
					saved_rt_elasticity;
				net->ipv6.sysctl.ip6_rt_gc_min_interval =
					saved_rt_min_interval;
				goto retry;
			}

833
			net_warn_ratelimited("Neighbour table overflow\n");
834
			dst_free(&rt->dst);
835 836
			return NULL;
		}
837
	}
L
Linus Torvalds 已提交
838

839 840
	return rt;
}
L
Linus Torvalds 已提交
841

E
Eric Dumazet 已提交
842 843
static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
					const struct in6_addr *daddr)
844
{
E
Eric Dumazet 已提交
845 846
	struct rt6_info *rt = ip6_rt_copy(ort, daddr);

847 848
	if (rt) {
		rt->rt6i_flags |= RTF_CACHE;
849
		rt->n = neigh_clone(ort->n);
850 851 852 853
	}
	return rt;
}

854
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
855
				      struct flowi6 *fl6, int flags)
L
Linus Torvalds 已提交
856 857
{
	struct fib6_node *fn;
858
	struct rt6_info *rt, *nrt;
T
Thomas Graf 已提交
859
	int strict = 0;
L
Linus Torvalds 已提交
860
	int attempts = 3;
861
	int err;
862
	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
L
Linus Torvalds 已提交
863

864
	strict |= flags & RT6_LOOKUP_F_IFACE;
L
Linus Torvalds 已提交
865 866

relookup:
T
Thomas Graf 已提交
867
	read_lock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
868

869
restart_2:
870
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
L
Linus Torvalds 已提交
871 872

restart:
873
	rt = rt6_select(fn, oif, strict | reachable);
874

875
	BACKTRACK(net, &fl6->saddr);
876
	if (rt == net->ipv6.ip6_null_entry ||
877
	    rt->rt6i_flags & RTF_CACHE)
878
		goto out;
L
Linus Torvalds 已提交
879

880
	dst_hold(&rt->dst);
T
Thomas Graf 已提交
881
	read_unlock_bh(&table->tb6_lock);
882

883
	if (!rt->n && !(rt->rt6i_flags & RTF_NONEXTHOP))
884
		nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
885
	else if (!(rt->dst.flags & DST_HOST))
886
		nrt = rt6_alloc_clone(rt, &fl6->daddr);
887 888
	else
		goto out2;
889

890
	dst_release(&rt->dst);
891
	rt = nrt ? : net->ipv6.ip6_null_entry;
L
Linus Torvalds 已提交
892

893
	dst_hold(&rt->dst);
894
	if (nrt) {
895
		err = ip6_ins_rt(nrt);
896
		if (!err)
L
Linus Torvalds 已提交
897 898 899
			goto out2;
	}

900 901 902 903
	if (--attempts <= 0)
		goto out2;

	/*
T
Thomas Graf 已提交
904
	 * Race condition! In the gap, when table->tb6_lock was
905 906
	 * released someone could insert this route.  Relookup.
	 */
907
	dst_release(&rt->dst);
908 909 910
	goto relookup;

out:
911 912 913 914
	if (reachable) {
		reachable = 0;
		goto restart_2;
	}
915
	dst_hold(&rt->dst);
T
Thomas Graf 已提交
916
	read_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
917
out2:
918 919
	rt->dst.lastuse = jiffies;
	rt->dst.__use++;
T
Thomas Graf 已提交
920 921

	return rt;
L
Linus Torvalds 已提交
922 923
}

924
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
925
					    struct flowi6 *fl6, int flags)
926
{
927
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
928 929
}

930 931 932 933 934 935 936 937 938 939
static struct dst_entry *ip6_route_input_lookup(struct net *net,
						struct net_device *dev,
						struct flowi6 *fl6, int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
}

T
Thomas Graf 已提交
940 941
void ip6_route_input(struct sk_buff *skb)
{
942
	const struct ipv6hdr *iph = ipv6_hdr(skb);
943
	struct net *net = dev_net(skb->dev);
944
	int flags = RT6_LOOKUP_F_HAS_SADDR;
945 946 947 948
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
949
		.flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
950 951
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
T
Thomas Graf 已提交
952
	};
953

954
	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
T
Thomas Graf 已提交
955 956
}

957
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
958
					     struct flowi6 *fl6, int flags)
L
Linus Torvalds 已提交
959
{
960
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
T
Thomas Graf 已提交
961 962
}

963
struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
964
				    struct flowi6 *fl6)
T
Thomas Graf 已提交
965 966 967
{
	int flags = 0;

968 969
	fl6->flowi6_iif = net->loopback_dev->ifindex;

970
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
971
		flags |= RT6_LOOKUP_F_IFACE;
T
Thomas Graf 已提交
972

973
	if (!ipv6_addr_any(&fl6->saddr))
974
		flags |= RT6_LOOKUP_F_HAS_SADDR;
975 976
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
977

978
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
L
Linus Torvalds 已提交
979 980
}

981
EXPORT_SYMBOL(ip6_route_output);
L
Linus Torvalds 已提交
982

983
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
984
{
985
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
986 987
	struct dst_entry *new = NULL;

988
	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
989
	if (rt) {
990
		new = &rt->dst;
991

992 993 994
		memset(new + 1, 0, sizeof(*rt) - sizeof(*new));
		rt6_init_peer(rt, net->ipv6.peers);

995
		new->__use = 1;
996 997
		new->input = dst_discard;
		new->output = dst_discard;
998

E
Eric Dumazet 已提交
999 1000 1001 1002
		if (dst_metrics_read_only(&ort->dst))
			new->_metrics = ort->dst._metrics;
		else
			dst_copy_metrics(new, &ort->dst);
1003 1004 1005 1006
		rt->rt6i_idev = ort->rt6i_idev;
		if (rt->rt6i_idev)
			in6_dev_hold(rt->rt6i_idev);

A
Alexey Dobriyan 已提交
1007
		rt->rt6i_gateway = ort->rt6i_gateway;
1008 1009
		rt->rt6i_flags = ort->rt6i_flags;
		rt6_clean_expires(rt);
1010 1011 1012 1013 1014 1015 1016 1017 1018 1019
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif

		dst_free(new);
	}

1020 1021
	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
1022 1023
}

L
Linus Torvalds 已提交
1024 1025 1026 1027 1028 1029 1030 1031 1032 1033
/*
 *	Destination cache support functions
 */

/*
 * dst_ops->check hook: validate a cached dst against @cookie.
 *
 * The dst is still valid when the route is attached to a fib6 node whose
 * serial number equals @cookie.  In that case a stale peer generation id
 * is refreshed, binding a peer (without creating one) if the route has
 * none yet.  Returns @dst when valid, NULL otherwise.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (!rt->rt6i_node || rt->rt6i_node->fn_sernum != cookie)
		return NULL;

	if (rt->rt6i_peer_genid != rt6_peer_genid()) {
		if (!rt6_has_peer(rt))
			rt6_bind_peer(rt, 0);
		rt->rt6i_peer_genid = rt6_peer_genid();
	}

	return dst;
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
1050 1051 1052 1053 1054 1055
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
L
Linus Torvalds 已提交
1056
			dst_release(dst);
1057 1058
			dst = NULL;
		}
L
Linus Torvalds 已提交
1059
	}
1060
	return dst;
L
Linus Torvalds 已提交
1061 1062 1063 1064 1065 1066
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

1067
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
L
Linus Torvalds 已提交
1068

E
Eric Dumazet 已提交
1069
	rt = (struct rt6_info *) skb_dst(skb);
L
Linus Torvalds 已提交
1070
	if (rt) {
1071 1072 1073
		if (rt->rt6i_flags & RTF_CACHE)
			rt6_update_expires(rt, 0);
		else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
L
Linus Torvalds 已提交
1074 1075 1076 1077
			rt->rt6i_node->fn_sernum = -1;
	}
}

1078 1079
/*
 * Record a smaller path MTU on a host (/128) route.
 *
 * The dst is always confirmed.  The MTU metric is only lowered when @mtu is
 * below the current dst MTU and the route's destination prefix length is
 * 128.  A value below IPV6_MIN_MTU is clamped to IPV6_MIN_MTU and the
 * RTAX_FEATURE_ALLFRAG feature bit is set.  The route is flagged
 * RTF_MODIFIED and its expiry is updated from the per-namespace
 * ip6_rt_mtu_expires sysctl.
 *
 * @sk and @skb are not used by this implementation.
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	struct rt6_info *rt6 = (struct rt6_info*)dst;

	dst_confirm(dst);
	if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
		struct net *net = dev_net(dst->dev);

		rt6->rt6i_flags |= RTF_MODIFIED;
		if (mtu < IPV6_MIN_MTU) {
			u32 features = dst_metric(dst, RTAX_FEATURES);
			mtu = IPV6_MIN_MTU;
			features |= RTAX_FEATURE_ALLFRAG;
			dst_metric_set(dst, RTAX_FEATURES, features);
		}
		dst_metric_set(dst, RTAX_MTU, mtu);
		rt6_update_expires(rt6, net->ipv6.sysctl.ip6_rt_mtu_expires);
	}
}

1099 1100
/*
 * Update the path MTU for the flow described by the IPv6 header found at
 * skb->data.
 *
 * @mtu is in network byte order.  A flow key is built from the header's
 * addresses and flow label plus @oif and @mark, the output route is looked
 * up, and (if the lookup did not yield an error dst) the new MTU is applied
 * to it.  The looked-up dst reference is always released.
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.flowi6_flags = 0;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	/* The first 32-bit word of the header carries version/class/label. */
	fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		ip6_rt_update_pmtu(dst, NULL, skb, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

/*
 * Socket flavour of ip6_update_pmtu(): the namespace, bound device index
 * and mark are all taken from @sk.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct net *net = sock_net(sk);
	int oif = sk->sk_bound_dev_if;
	u32 mark = sk->sk_mark;

	ip6_update_pmtu(skb, net, mtu, oif, mark);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143
/*
 * Process a redirect for the flow described by the IPv6 header found at
 * skb->data.
 *
 * A flow key is built from the header's addresses and flow label plus @oif
 * and @mark; the output route is looked up and, unless the lookup yielded
 * an error dst, handed to rt6_do_redirect().  The looked-up dst reference
 * is always released.
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.flowi6_flags = 0;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	/* The first 32-bit word of the header carries version/class/label. */
	fl6.flowlabel = (*(__be32 *) iph) & IPV6_FLOWINFO_MASK;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

1155
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
L
Linus Torvalds 已提交
1156
{
1157 1158 1159 1160
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

L
Linus Torvalds 已提交
1161 1162
	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

1163 1164
	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
L
Linus Torvalds 已提交
1165 1166

	/*
1167 1168 1169
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
L
Linus Torvalds 已提交
1170 1171 1172 1173 1174 1175 1176
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}

1177
static unsigned int ip6_mtu(const struct dst_entry *dst)
1178 1179
{
	struct inet6_dev *idev;
1180 1181 1182 1183 1184 1185
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = IPV6_MIN_MTU;
1186 1187 1188 1189 1190 1191 1192 1193 1194 1195

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}

1196 1197
static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);
1198

1199
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
L
Linus Torvalds 已提交
1200
				  struct neighbour *neigh,
1201
				  struct flowi6 *fl6)
L
Linus Torvalds 已提交
1202
{
1203
	struct dst_entry *dst;
L
Linus Torvalds 已提交
1204 1205
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
1206
	struct net *net = dev_net(dev);
L
Linus Torvalds 已提交
1207

1208
	if (unlikely(!idev))
E
Eric Dumazet 已提交
1209
		return ERR_PTR(-ENODEV);
L
Linus Torvalds 已提交
1210

1211
	rt = ip6_dst_alloc(net, dev, 0, NULL);
1212
	if (unlikely(!rt)) {
L
Linus Torvalds 已提交
1213
		in6_dev_put(idev);
1214
		dst = ERR_PTR(-ENOMEM);
L
Linus Torvalds 已提交
1215 1216 1217 1218 1219
		goto out;
	}

	if (neigh)
		neigh_hold(neigh);
1220
	else {
1221
		neigh = ip6_neigh_lookup(&rt->dst, NULL, &fl6->daddr);
1222
		if (IS_ERR(neigh)) {
1223
			in6_dev_put(idev);
1224 1225 1226
			dst_free(&rt->dst);
			return ERR_CAST(neigh);
		}
1227
	}
L
Linus Torvalds 已提交
1228

1229 1230
	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
1231
	rt->n = neigh;
1232
	atomic_set(&rt->dst.__refcnt, 1);
1233
	rt->rt6i_dst.addr = fl6->daddr;
1234 1235
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
1236
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
L
Linus Torvalds 已提交
1237

1238
	spin_lock_bh(&icmp6_dst_lock);
1239 1240
	rt->dst.next = icmp6_dst_gc_list;
	icmp6_dst_gc_list = &rt->dst;
1241
	spin_unlock_bh(&icmp6_dst_lock);
L
Linus Torvalds 已提交
1242

1243
	fib6_force_start_gc(net);
L
Linus Torvalds 已提交
1244

1245 1246
	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

L
Linus Torvalds 已提交
1247
out:
1248
	return dst;
L
Linus Torvalds 已提交
1249 1250
}

1251
int icmp6_dst_gc(void)
L
Linus Torvalds 已提交
1252
{
1253
	struct dst_entry *dst, **pprev;
1254
	int more = 0;
L
Linus Torvalds 已提交
1255

1256 1257
	spin_lock_bh(&icmp6_dst_lock);
	pprev = &icmp6_dst_gc_list;
1258

L
Linus Torvalds 已提交
1259 1260 1261 1262 1263 1264
	while ((dst = *pprev) != NULL) {
		if (!atomic_read(&dst->__refcnt)) {
			*pprev = dst->next;
			dst_free(dst);
		} else {
			pprev = &dst->next;
1265
			++more;
L
Linus Torvalds 已提交
1266 1267 1268
		}
	}

1269
	spin_unlock_bh(&icmp6_dst_lock);
1270

1271
	return more;
L
Linus Torvalds 已提交
1272 1273
}

1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292
/*
 * Walk icmp6_dst_gc_list under icmp6_dst_lock and let @func decide, per
 * entry, whether it should go: a non-zero return unlinks and frees the
 * entry, zero keeps it.  @arg is passed through to @func unchanged.
 */
static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
			    void *arg)
{
	struct dst_entry **link = &icmp6_dst_gc_list;
	struct dst_entry *entry;

	spin_lock_bh(&icmp6_dst_lock);
	for (entry = *link; entry != NULL; entry = *link) {
		if (!func((struct rt6_info *) entry, arg)) {
			/* Keep it: advance to the next link. */
			link = &entry->next;
			continue;
		}
		/* Drop it: splice out; *link now names the successor. */
		*link = entry->next;
		dst_free(entry);
	}
	spin_unlock_bh(&icmp6_dst_lock);
}

1293
static int ip6_dst_gc(struct dst_ops *ops)
L
Linus Torvalds 已提交
1294 1295
{
	unsigned long now = jiffies;
1296
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1297 1298 1299 1300 1301
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1302
	int entries;
1303

1304
	entries = dst_entries_get_fast(ops);
1305
	if (time_after(rt_last_gc + rt_min_interval, now) &&
1306
	    entries <= rt_max_size)
L
Linus Torvalds 已提交
1307 1308
		goto out;

1309 1310 1311
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
	net->ipv6.ip6_rt_last_gc = now;
1312 1313
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
1314
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
L
Linus Torvalds 已提交
1315
out:
1316
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1317
	return entries > rt_max_size;
L
Linus Torvalds 已提交
1318 1319 1320 1321 1322 1323 1324 1325
}

/* Clean host part of a prefix. Not necessary in radix tree,
   but results in cleaner routing tables.

   Remove it only when all the things will work!
 */

1326
int ip6_dst_hoplimit(struct dst_entry *dst)
L
Linus Torvalds 已提交
1327
{
1328
	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1329
	if (hoplimit == 0) {
1330
		struct net_device *dev = dst->dev;
1331 1332 1333 1334 1335
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		if (idev)
1336
			hoplimit = idev->cnf.hop_limit;
1337
		else
1338
			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1339
		rcu_read_unlock();
L
Linus Torvalds 已提交
1340 1341 1342
	}
	return hoplimit;
}
1343
EXPORT_SYMBOL(ip6_dst_hoplimit);
L
Linus Torvalds 已提交
1344 1345 1346 1347 1348

/*
 *
 */

1349
int ip6_route_add(struct fib6_config *cfg)
L
Linus Torvalds 已提交
1350 1351
{
	int err;
1352
	struct net *net = cfg->fc_nlinfo.nl_net;
L
Linus Torvalds 已提交
1353 1354 1355
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
T
Thomas Graf 已提交
1356
	struct fib6_table *table;
L
Linus Torvalds 已提交
1357 1358
	int addr_type;

1359
	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
L
Linus Torvalds 已提交
1360 1361
		return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
1362
	if (cfg->fc_src_len)
L
Linus Torvalds 已提交
1363 1364
		return -EINVAL;
#endif
1365
	if (cfg->fc_ifindex) {
L
Linus Torvalds 已提交
1366
		err = -ENODEV;
1367
		dev = dev_get_by_index(net, cfg->fc_ifindex);
L
Linus Torvalds 已提交
1368 1369 1370 1371 1372 1373 1374
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

1375 1376
	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;
L
Linus Torvalds 已提交
1377

1378
	err = -ENOBUFS;
1379 1380
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1381
		table = fib6_get_table(net, cfg->fc_table);
1382
		if (!table) {
1383
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1384 1385 1386 1387 1388
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}
1389 1390

	if (!table)
T
Thomas Graf 已提交
1391 1392
		goto out;

1393
	rt = ip6_dst_alloc(net, NULL, DST_NOCOUNT, table);
L
Linus Torvalds 已提交
1394

1395
	if (!rt) {
L
Linus Torvalds 已提交
1396 1397 1398 1399
		err = -ENOMEM;
		goto out;
	}

1400
	rt->dst.obsolete = -1;
1401 1402 1403 1404 1405 1406

	if (cfg->fc_flags & RTF_EXPIRES)
		rt6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		rt6_clean_expires(rt);
L
Linus Torvalds 已提交
1407

1408 1409 1410 1411 1412
	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);
L
Linus Torvalds 已提交
1413 1414

	if (addr_type & IPV6_ADDR_MULTICAST)
1415
		rt->dst.input = ip6_mc_input;
1416 1417
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
L
Linus Torvalds 已提交
1418
	else
1419
		rt->dst.input = ip6_forward;
L
Linus Torvalds 已提交
1420

1421
	rt->dst.output = ip6_output;
L
Linus Torvalds 已提交
1422

1423 1424
	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
L
Linus Torvalds 已提交
1425
	if (rt->rt6i_dst.plen == 128)
1426
	       rt->dst.flags |= DST_HOST;
L
Linus Torvalds 已提交
1427

1428 1429 1430 1431 1432 1433 1434 1435
	if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
		u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
		if (!metrics) {
			err = -ENOMEM;
			goto out;
		}
		dst_init_metrics(&rt->dst, metrics, 0);
	}
L
Linus Torvalds 已提交
1436
#ifdef CONFIG_IPV6_SUBTREES
1437 1438
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
L
Linus Torvalds 已提交
1439 1440
#endif

1441
	rt->rt6i_metric = cfg->fc_metric;
L
Linus Torvalds 已提交
1442 1443 1444 1445

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
1446
	if ((cfg->fc_flags & RTF_REJECT) ||
1447 1448 1449
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
L
Linus Torvalds 已提交
1450
		/* hold loopback dev/idev if we haven't done so. */
1451
		if (dev != net->loopback_dev) {
L
Linus Torvalds 已提交
1452 1453 1454 1455
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
1456
			dev = net->loopback_dev;
L
Linus Torvalds 已提交
1457 1458 1459 1460 1461 1462 1463
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
1464 1465 1466
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		rt->dst.error = -ENETUNREACH;
L
Linus Torvalds 已提交
1467 1468 1469 1470
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

1471
	if (cfg->fc_flags & RTF_GATEWAY) {
1472
		const struct in6_addr *gw_addr;
L
Linus Torvalds 已提交
1473 1474
		int gwa_type;

1475
		gw_addr = &cfg->fc_gateway;
A
Alexey Dobriyan 已提交
1476
		rt->rt6i_gateway = *gw_addr;
L
Linus Torvalds 已提交
1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489
		gwa_type = ipv6_addr_type(gw_addr);

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt;

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
			err = -EINVAL;
1490
			if (!(gwa_type & IPV6_ADDR_UNICAST))
L
Linus Torvalds 已提交
1491 1492
				goto out;

1493
			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
L
Linus Torvalds 已提交
1494 1495

			err = -EHOSTUNREACH;
1496
			if (!grt)
L
Linus Torvalds 已提交
1497 1498
				goto out;
			if (dev) {
1499
				if (dev != grt->dst.dev) {
1500
					dst_release(&grt->dst);
L
Linus Torvalds 已提交
1501 1502 1503
					goto out;
				}
			} else {
1504
				dev = grt->dst.dev;
L
Linus Torvalds 已提交
1505 1506 1507 1508
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
1509
			if (!(grt->rt6i_flags & RTF_GATEWAY))
L
Linus Torvalds 已提交
1510
				err = 0;
1511
			dst_release(&grt->dst);
L
Linus Torvalds 已提交
1512 1513 1514 1515 1516

			if (err)
				goto out;
		}
		err = -EINVAL;
1517
		if (!dev || (dev->flags & IFF_LOOPBACK))
L
Linus Torvalds 已提交
1518 1519 1520 1521
			goto out;
	}

	err = -ENODEV;
1522
	if (!dev)
L
Linus Torvalds 已提交
1523 1524
		goto out;

1525 1526 1527 1528 1529
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			err = -EINVAL;
			goto out;
		}
A
Alexey Dobriyan 已提交
1530
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1531 1532 1533 1534
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

1535
	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1536
		err = rt6_bind_neighbour(rt, dev);
1537
		if (err)
L
Linus Torvalds 已提交
1538 1539 1540
			goto out;
	}

1541
	rt->rt6i_flags = cfg->fc_flags;
L
Linus Torvalds 已提交
1542 1543

install_route:
1544 1545 1546 1547 1548
	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1549
			int type = nla_type(nla);
1550 1551 1552

			if (type) {
				if (type > RTAX_MAX) {
L
Linus Torvalds 已提交
1553 1554 1555
					err = -EINVAL;
					goto out;
				}
1556

1557
				dst_metric_set(&rt->dst, type, nla_get_u32(nla));
L
Linus Torvalds 已提交
1558 1559 1560 1561
			}
		}
	}

1562
	rt->dst.dev = dev;
L
Linus Torvalds 已提交
1563
	rt->rt6i_idev = idev;
T
Thomas Graf 已提交
1564
	rt->rt6i_table = table;
1565

1566
	cfg->fc_nlinfo.nl_net = dev_net(dev);
1567

1568
	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
L
Linus Torvalds 已提交
1569 1570 1571 1572 1573 1574 1575

out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
1576
		dst_free(&rt->dst);
L
Linus Torvalds 已提交
1577 1578 1579
	return err;
}

1580
static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
L
Linus Torvalds 已提交
1581 1582
{
	int err;
T
Thomas Graf 已提交
1583
	struct fib6_table *table;
1584
	struct net *net = dev_net(rt->dst.dev);
L
Linus Torvalds 已提交
1585

1586
	if (rt == net->ipv6.ip6_null_entry)
1587 1588
		return -ENOENT;

T
Thomas Graf 已提交
1589 1590
	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
1591

1592
	err = fib6_del(rt, info);
1593
	dst_release(&rt->dst);
L
Linus Torvalds 已提交
1594

T
Thomas Graf 已提交
1595
	write_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
1596 1597 1598 1599

	return err;
}

1600 1601
int ip6_del_rt(struct rt6_info *rt)
{
1602
	struct nl_info info = {
1603
		.nl_net = dev_net(rt->dst.dev),
1604
	};
1605
	return __ip6_del_rt(rt, &info);
1606 1607
}

1608
static int ip6_route_del(struct fib6_config *cfg)
L
Linus Torvalds 已提交
1609
{
T
Thomas Graf 已提交
1610
	struct fib6_table *table;
L
Linus Torvalds 已提交
1611 1612 1613 1614
	struct fib6_node *fn;
	struct rt6_info *rt;
	int err = -ESRCH;

1615
	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1616
	if (!table)
T
Thomas Graf 已提交
1617 1618 1619
		return err;

	read_lock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
1620

T
Thomas Graf 已提交
1621
	fn = fib6_locate(&table->tb6_root,
1622 1623
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len);
1624

L
Linus Torvalds 已提交
1625
	if (fn) {
1626
		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1627
			if (cfg->fc_ifindex &&
1628 1629
			    (!rt->dst.dev ||
			     rt->dst.dev->ifindex != cfg->fc_ifindex))
L
Linus Torvalds 已提交
1630
				continue;
1631 1632
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
L
Linus Torvalds 已提交
1633
				continue;
1634
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
L
Linus Torvalds 已提交
1635
				continue;
1636
			dst_hold(&rt->dst);
T
Thomas Graf 已提交
1637
			read_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
1638

1639
			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
L
Linus Torvalds 已提交
1640 1641
		}
	}
T
Thomas Graf 已提交
1642
	read_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
1643 1644 1645 1646

	return err;
}

1647
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
1648
{
1649
	struct net *net = dev_net(skb->dev);
1650
	struct netevent_redirect netevent;
1651 1652 1653
	struct rt6_info *rt, *nrt = NULL;
	const struct in6_addr *target;
	struct ndisc_options ndopts;
1654 1655
	const struct in6_addr *dest;
	struct neighbour *old_neigh;
1656 1657 1658
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct icmp6hdr *icmph;
1659 1660
	int optlen, on_link;
	u8 *lladdr;
1661 1662 1663 1664 1665

	optlen = skb->tail - skb->transport_header;
	optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr);

	if (optlen < 0) {
1666
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
1667 1668 1669 1670 1671 1672 1673 1674
		return;
	}

	icmph = icmp6_hdr(skb);
	target = (const struct in6_addr *) (icmph + 1);
	dest = target + 1;

	if (ipv6_addr_is_multicast(dest)) {
1675
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
1676 1677 1678
		return;
	}

1679
	on_link = 0;
1680 1681 1682 1683
	if (ipv6_addr_equal(dest, target)) {
		on_link = 1;
	} else if (ipv6_addr_type(target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
1684
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options((u8*)(dest + 1), optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}
1703 1704

	lladdr = NULL;
1705 1706 1707 1708 1709 1710 1711 1712 1713
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

1714 1715 1716
	rt = (struct rt6_info *) dst;
	if (rt == net->ipv6.ip6_null_entry) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
1717
		return;
1718
	}
1719

1720 1721 1722 1723 1724
	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm(&rt->dst);
1725

1726 1727 1728
	neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1);
	if (!neigh)
		return;
1729

1730 1731 1732
	/* Duplicate redirect: silently ignore. */
	old_neigh = rt->n;
	if (neigh == old_neigh)
1733
		goto out;
L
Linus Torvalds 已提交
1734 1735 1736 1737 1738

	/*
	 *	We have finally decided to accept it.
	 */

1739
	neigh_update(neigh, lladdr, NUD_STALE,
L
Linus Torvalds 已提交
1740 1741 1742 1743 1744 1745
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER))
		     );

E
Eric Dumazet 已提交
1746
	nrt = ip6_rt_copy(rt, dest);
1747
	if (!nrt)
L
Linus Torvalds 已提交
1748 1749 1750 1751 1752 1753
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

A
Alexey Dobriyan 已提交
1754
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1755
	nrt->n = neigh_clone(neigh);
L
Linus Torvalds 已提交
1756

1757
	if (ip6_ins_rt(nrt))
L
Linus Torvalds 已提交
1758 1759
		goto out;

1760
	netevent.old = &rt->dst;
1761
	netevent.old_neigh = old_neigh;
1762
	netevent.new = &nrt->dst;
1763 1764
	netevent.new_neigh = neigh;
	netevent.daddr = dest;
1765 1766
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

1767
	if (rt->rt6i_flags & RTF_CACHE) {
1768
		rt = (struct rt6_info *) dst_clone(&rt->dst);
1769
		ip6_del_rt(rt);
L
Linus Torvalds 已提交
1770 1771 1772
	}

out:
1773
	neigh_release(neigh);
1774 1775
}

L
Linus Torvalds 已提交
1776 1777 1778 1779
/*
 *	Misc support functions
 */

1780
static struct rt6_info *ip6_rt_copy(struct rt6_info *ort,
E
Eric Dumazet 已提交
1781
				    const struct in6_addr *dest)
L
Linus Torvalds 已提交
1782
{
1783
	struct net *net = dev_net(ort->dst.dev);
1784 1785
	struct rt6_info *rt = ip6_dst_alloc(net, ort->dst.dev, 0,
					    ort->rt6i_table);
L
Linus Torvalds 已提交
1786 1787

	if (rt) {
1788 1789
		rt->dst.input = ort->dst.input;
		rt->dst.output = ort->dst.output;
1790
		rt->dst.flags |= DST_HOST;
1791

A
Alexey Dobriyan 已提交
1792
		rt->rt6i_dst.addr = *dest;
1793
		rt->rt6i_dst.plen = 128;
1794
		dst_copy_metrics(&rt->dst, &ort->dst);
1795
		rt->dst.error = ort->dst.error;
L
Linus Torvalds 已提交
1796 1797 1798
		rt->rt6i_idev = ort->rt6i_idev;
		if (rt->rt6i_idev)
			in6_dev_hold(rt->rt6i_idev);
1799
		rt->dst.lastuse = jiffies;
L
Linus Torvalds 已提交
1800

A
Alexey Dobriyan 已提交
1801
		rt->rt6i_gateway = ort->rt6i_gateway;
1802 1803 1804 1805 1806 1807
		rt->rt6i_flags = ort->rt6i_flags;
		if ((ort->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ==
		    (RTF_DEFAULT | RTF_ADDRCONF))
			rt6_set_from(rt, ort);
		else
			rt6_clean_expires(rt);
L
Linus Torvalds 已提交
1808 1809 1810 1811 1812
		rt->rt6i_metric = 0;

#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
1813
		memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
T
Thomas Graf 已提交
1814
		rt->rt6i_table = ort->rt6i_table;
L
Linus Torvalds 已提交
1815 1816 1817 1818
	}
	return rt;
}

1819
#ifdef CONFIG_IPV6_ROUTE_INFO
1820
static struct rt6_info *rt6_get_route_info(struct net *net,
1821 1822
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex)
1823 1824 1825
{
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
T
Thomas Graf 已提交
1826 1827
	struct fib6_table *table;

1828
	table = fib6_get_table(net, RT6_TABLE_INFO);
1829
	if (!table)
T
Thomas Graf 已提交
1830
		return NULL;
1831

T
Thomas Graf 已提交
1832 1833
	write_lock_bh(&table->tb6_lock);
	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1834 1835 1836
	if (!fn)
		goto out;

1837
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1838
		if (rt->dst.dev->ifindex != ifindex)
1839 1840 1841 1842 1843
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
1844
		dst_hold(&rt->dst);
1845 1846 1847
		break;
	}
out:
T
Thomas Graf 已提交
1848
	write_unlock_bh(&table->tb6_lock);
1849 1850 1851
	return rt;
}

1852
static struct rt6_info *rt6_add_route_info(struct net *net,
1853 1854
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex,
1855
					   unsigned int pref)
1856
{
1857 1858
	struct fib6_config cfg = {
		.fc_table	= RT6_TABLE_INFO,
1859
		.fc_metric	= IP6_RT_PRIO_USER,
1860 1861 1862 1863
		.fc_ifindex	= ifindex,
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
1864 1865 1866
		.fc_nlinfo.pid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
1867 1868
	};

A
Alexey Dobriyan 已提交
1869 1870
	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;
1871

1872 1873
	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
1874
		cfg.fc_flags |= RTF_DEFAULT;
1875

1876
	ip6_route_add(&cfg);
1877

1878
	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1879 1880 1881
}
#endif

1882
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1883
{
L
Linus Torvalds 已提交
1884
	struct rt6_info *rt;
T
Thomas Graf 已提交
1885
	struct fib6_table *table;
L
Linus Torvalds 已提交
1886

1887
	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1888
	if (!table)
T
Thomas Graf 已提交
1889
		return NULL;
L
Linus Torvalds 已提交
1890

T
Thomas Graf 已提交
1891
	write_lock_bh(&table->tb6_lock);
1892
	for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
1893
		if (dev == rt->dst.dev &&
1894
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
L
Linus Torvalds 已提交
1895 1896 1897 1898
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	if (rt)
1899
		dst_hold(&rt->dst);
T
Thomas Graf 已提交
1900
	write_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
1901 1902 1903
	return rt;
}

1904
struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1905 1906
				     struct net_device *dev,
				     unsigned int pref)
L
Linus Torvalds 已提交
1907
{
1908 1909
	struct fib6_config cfg = {
		.fc_table	= RT6_TABLE_DFLT,
1910
		.fc_metric	= IP6_RT_PRIO_USER,
1911 1912 1913
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1914 1915
		.fc_nlinfo.pid = 0,
		.fc_nlinfo.nlh = NULL,
1916
		.fc_nlinfo.nl_net = dev_net(dev),
1917
	};
L
Linus Torvalds 已提交
1918

A
Alexey Dobriyan 已提交
1919
	cfg.fc_gateway = *gwaddr;
L
Linus Torvalds 已提交
1920

1921
	ip6_route_add(&cfg);
L
Linus Torvalds 已提交
1922 1923 1924 1925

	return rt6_get_dflt_router(gwaddr, dev);
}

1926
void rt6_purge_dflt_routers(struct net *net)
L
Linus Torvalds 已提交
1927 1928
{
	struct rt6_info *rt;
T
Thomas Graf 已提交
1929 1930 1931
	struct fib6_table *table;

	/* NOTE: Keep consistent with rt6_get_dflt_router */
1932
	table = fib6_get_table(net, RT6_TABLE_DFLT);
1933
	if (!table)
T
Thomas Graf 已提交
1934
		return;
L
Linus Torvalds 已提交
1935 1936

restart:
T
Thomas Graf 已提交
1937
	read_lock_bh(&table->tb6_lock);
1938
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
L
Linus Torvalds 已提交
1939
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1940
			dst_hold(&rt->dst);
T
Thomas Graf 已提交
1941
			read_unlock_bh(&table->tb6_lock);
1942
			ip6_del_rt(rt);
L
Linus Torvalds 已提交
1943 1944 1945
			goto restart;
		}
	}
T
Thomas Graf 已提交
1946
	read_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
1947 1948
}

1949 1950
static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962
				 struct fib6_config *cfg)
{
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = RT6_TABLE_MAIN;
	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
	cfg->fc_metric = rtmsg->rtmsg_metric;
	cfg->fc_expires = rtmsg->rtmsg_info;
	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
	cfg->fc_src_len = rtmsg->rtmsg_src_len;
	cfg->fc_flags = rtmsg->rtmsg_flags;

1963
	cfg->fc_nlinfo.nl_net = net;
1964

A
Alexey Dobriyan 已提交
1965 1966 1967
	cfg->fc_dst = rtmsg->rtmsg_dst;
	cfg->fc_src = rtmsg->rtmsg_src;
	cfg->fc_gateway = rtmsg->rtmsg_gateway;
1968 1969
}

1970
/*
 * Handle the SIOCADDRT / SIOCDELRT route ioctls.
 *
 * Requires CAP_NET_ADMIN.  The user-supplied in6_rtmsg is copied in,
 * converted to a fib6_config, and the add/delete is performed under the
 * rtnl lock.
 *
 * Returns the add/delete result, -EPERM without the capability, -EFAULT
 * when the user buffer cannot be read, or -EINVAL for any other @cmd.
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch(cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}

/*
 *	Drop the packet on the floor
 */

2011
/*
 * Common drop path: bump a MIB counter, send an ICMPv6 destination
 * unreachable with @code, and free @skb.
 *
 * For IPSTATS_MIB_INNOROUTES, a packet whose destination is the
 * unspecified address is counted as IPSTATS_MIB_INADDRERRORS instead; all
 * other cases count @ipstats_mib_noroutes itself.  Values other than
 * IPSTATS_MIB_INNOROUTES / IPSTATS_MIB_OUTNOROUTES bump no counter.
 *
 * Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}

2034 2035
static int ip6_pkt_discard(struct sk_buff *skb)
{
2036
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2037 2038
}

2039
static int ip6_pkt_discard_out(struct sk_buff *skb)
L
Linus Torvalds 已提交
2040
{
E
Eric Dumazet 已提交
2041
	skb->dev = skb_dst(skb)->dev;
2042
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
L
Linus Torvalds 已提交
2043 2044
}

2045 2046
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

2047 2048
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
2049
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2050 2051 2052 2053
}

static int ip6_pkt_prohibit_out(struct sk_buff *skb)
{
E
Eric Dumazet 已提交
2054
	skb->dev = skb_dst(skb)->dev;
2055
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2056 2057
}

2058 2059
#endif

L
Linus Torvalds 已提交
2060 2061 2062 2063 2064 2065
/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
2066
				    bool anycast)
L
Linus Torvalds 已提交
2067
{
2068
	struct net *net = dev_net(idev->dev);
2069
	struct rt6_info *rt = ip6_dst_alloc(net, net->loopback_dev, 0, NULL);
2070
	int err;
L
Linus Torvalds 已提交
2071

2072
	if (!rt) {
2073
		net_warn_ratelimited("Maximum number of routes reached, consider increasing route/max_size\n");
L
Linus Torvalds 已提交
2074
		return ERR_PTR(-ENOMEM);
2075
	}
L
Linus Torvalds 已提交
2076 2077 2078

	in6_dev_hold(idev);

2079
	rt->dst.flags |= DST_HOST;
2080 2081
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
L
Linus Torvalds 已提交
2082
	rt->rt6i_idev = idev;
2083
	rt->dst.obsolete = -1;
L
Linus Torvalds 已提交
2084 2085

	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2086 2087 2088
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
L
Linus Torvalds 已提交
2089
		rt->rt6i_flags |= RTF_LOCAL;
2090
	err = rt6_bind_neighbour(rt, rt->dst.dev);
2091
	if (err) {
2092
		dst_free(&rt->dst);
2093
		return ERR_PTR(err);
L
Linus Torvalds 已提交
2094 2095
	}

A
Alexey Dobriyan 已提交
2096
	rt->rt6i_dst.addr = *addr;
L
Linus Torvalds 已提交
2097
	rt->rt6i_dst.plen = 128;
2098
	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
L
Linus Torvalds 已提交
2099

2100
	atomic_set(&rt->dst.__refcnt, 1);
L
Linus Torvalds 已提交
2101 2102 2103 2104

	return rt;
}

2105 2106
int ip6_route_get_saddr(struct net *net,
			struct rt6_info *rt,
2107
			const struct in6_addr *daddr,
2108 2109 2110 2111 2112 2113
			unsigned int prefs,
			struct in6_addr *saddr)
{
	struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
	int err = 0;
	if (rt->rt6i_prefsrc.plen)
A
Alexey Dobriyan 已提交
2114
		*saddr = rt->rt6i_prefsrc.addr;
2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133
	else
		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
					 daddr, prefs, saddr);
	return err;
}

/*
 * Remove a deleted address from route prefsrc entries.
 *
 * Argument bundle built by rt6_remove_prefsrc() and handed to
 * fib6_remove_prefsrc() through fib6_clean_all().
 */
struct arg_dev_net_ip {
	struct net_device *dev;	/* only routes on this device match; NULL matches any device */
	struct net *net;	/* namespace whose ip6_null_entry is never touched */
	struct in6_addr *addr;	/* address compared against each route's rt6i_prefsrc.addr */
};

static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

2134
	if (((void *)rt->dst.dev == dev || !dev) &&
2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153
	    rt != net->ipv6.ip6_null_entry &&
	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
		/* remove prefsrc entry */
		rt->rt6i_prefsrc.plen = 0;
	}
	return 0;
}

/*
 * Walk every route of the namespace owning @ifp's device and run
 * fib6_remove_prefsrc() on it, so that routes on that device whose
 * preferred source equals @ifp's address have it cleared.
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net_device *dev = ifp->idev->dev;
	struct net *net = dev_net(dev);
	struct arg_dev_net_ip adni;

	adni.dev = dev;
	adni.net = net;
	adni.addr = &ifp->addr;

	fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
}

2154 2155 2156 2157 2158
/*
 * Argument bundle built by rt6_ifdown() and handed to fib6_ifdown()
 * through fib6_clean_all() and icmp6_clean_all().
 */
struct arg_dev_net {
	struct net_device *dev;	/* routes on this device match; NULL matches every device */
	struct net *net;	/* namespace whose ip6_null_entry never matches */
};

L
Linus Torvalds 已提交
2159 2160
static int fib6_ifdown(struct rt6_info *rt, void *arg)
{
S
stephen hemminger 已提交
2161 2162
	const struct arg_dev_net *adn = arg;
	const struct net_device *dev = adn->dev;
2163

2164
	if ((rt->dst.dev == dev || !dev) &&
2165
	    rt != adn->net->ipv6.ip6_null_entry)
L
Linus Torvalds 已提交
2166
		return -1;
2167

L
Linus Torvalds 已提交
2168 2169 2170
	return 0;
}

2171
void rt6_ifdown(struct net *net, struct net_device *dev)
L
Linus Torvalds 已提交
2172
{
2173 2174 2175 2176 2177 2178
	struct arg_dev_net adn = {
		.dev = dev,
		.net = net,
	};

	fib6_clean_all(net, fib6_ifdown, 0, &adn);
2179
	icmp6_clean_all(fib6_ifdown, &adn);
L
Linus Torvalds 已提交
2180 2181
}

2182
/*
 * Argument bundle built by rt6_mtu_change() and handed to
 * rt6_mtu_change_route() through fib6_clean_all().
 */
struct rt6_mtu_change_arg {
	struct net_device *dev;		/* device whose MTU changed */
	unsigned int mtu;		/* the new MTU */
};

static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
2199
	if (!idev)
L
Linus Torvalds 已提交
2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discouvery.
	 */
2216
	if (rt->dst.dev == arg->dev &&
2217 2218 2219 2220
	    !dst_metric_locked(&rt->dst, RTAX_MTU) &&
	    (dst_mtu(&rt->dst) >= arg->mtu ||
	     (dst_mtu(&rt->dst) < arg->mtu &&
	      dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2221
		dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2222
	}
L
Linus Torvalds 已提交
2223 2224 2225
	return 0;
}

2226
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
L
Linus Torvalds 已提交
2227
{
T
Thomas Graf 已提交
2228 2229 2230 2231
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};
L
Linus Torvalds 已提交
2232

2233
	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
L
Linus Torvalds 已提交
2234 2235
}

2236
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2237
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2238
	[RTA_OIF]               = { .type = NLA_U32 },
2239
	[RTA_IIF]		= { .type = NLA_U32 },
2240 2241 2242 2243 2244 2245
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
};

static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg)
L
Linus Torvalds 已提交
2246
{
2247 2248 2249
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	int err;
L
Linus Torvalds 已提交
2250

2251 2252 2253
	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;
L
Linus Torvalds 已提交
2254

2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267
	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;

	if (rtm->rtm_type == RTN_UNREACHABLE)
		cfg->fc_flags |= RTF_REJECT;

2268 2269 2270
	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

2271 2272
	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
	cfg->fc_nlinfo.nlh = nlh;
2273
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2274 2275 2276 2277

	if (tb[RTA_GATEWAY]) {
		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
		cfg->fc_flags |= RTF_GATEWAY;
L
Linus Torvalds 已提交
2278
	}
2279 2280 2281 2282 2283 2284 2285 2286

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
L
Linus Torvalds 已提交
2287
	}
2288 2289 2290 2291 2292 2293 2294 2295

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
L
Linus Torvalds 已提交
2296
	}
2297

2298 2299 2300
	if (tb[RTA_PREFSRC])
		nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);

2301 2302 2303 2304 2305 2306 2307 2308 2309
	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
L
Linus Torvalds 已提交
2310
	}
2311 2312 2313 2314 2315 2316 2317

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	err = 0;
errout:
	return err;
L
Linus Torvalds 已提交
2318 2319
}

2320
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
L
Linus Torvalds 已提交
2321
{
2322 2323
	struct fib6_config cfg;
	int err;
L
Linus Torvalds 已提交
2324

2325 2326 2327 2328 2329
	err = rtm_to_fib6_config(skb, nlh, &cfg);
	if (err < 0)
		return err;

	return ip6_route_del(&cfg);
L
Linus Torvalds 已提交
2330 2331
}

2332
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
L
Linus Torvalds 已提交
2333
{
2334 2335
	struct fib6_config cfg;
	int err;
L
Linus Torvalds 已提交
2336

2337 2338 2339 2340 2341
	err = rtm_to_fib6_config(skb, nlh, &cfg);
	if (err < 0)
		return err;

	return ip6_route_add(&cfg);
L
Linus Torvalds 已提交
2342 2343
}

2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354
static inline size_t rt6_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
2355
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2356 2357 2358
	       + nla_total_size(sizeof(struct rta_cacheinfo));
}

2359 2360
/*
 * Append one rtnetlink route message describing @rt to @skb.
 *
 * @dst/@src: when non-NULL they are reported as RTA_DST / RTA_SRC with
 *	a prefix length of 128 instead of the route's own prefixes
 *	(RTA_SRC only with CONFIG_IPV6_SUBTREES).
 * @iif: when non-zero it is reported as RTA_IIF (multicast destinations
 *	are handed to ip6mr_get_route() with CONFIG_IPV6_MROUTE).  When
 *	zero and @dst is set, a source address chosen by
 *	ip6_route_get_saddr() is reported as RTA_PREFSRC.
 * @type, @pid, @seq, @flags: passed to nlmsg_put() for the header.
 * @prefix: when non-zero only RTF_PREFIX_RT routes are dumped.
 * @nowait: passed to ip6mr_get_route() and used to interpret its result.
 *
 * Returns 1 when the route was skipped because of @prefix, 0 when
 * ip6mr_get_route() returned 0 with @nowait clear, the result of
 * nlmsg_end() on success, and -EMSGSIZE when the message does not fit
 * (a partially built message is cancelled).
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 pid, u32 seq,
			 int prefix, int nowait, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct neighbour *neigh;
	struct rtmsg *rtm;
	long expires = 0;
	u32 table;

	/* user wants prefix routes only; skipping anything else counts
	 * as success
	 */
	if (prefix && !(rt->rt6i_flags & RTF_PREFIX_RT))
		return 1;

	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	table = rt->rt6i_table ? rt->rt6i_table->tb6_id : RT6_TABLE_UNSPEC;

	/* Fixed rtmsg header. */
	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	rtm->rtm_table = table;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_flags = (rt->rt6i_flags & RTF_CACHE) ? RTM_F_CLONED : 0;

	if (rt->rt6i_flags & RTF_REJECT)
		rtm->rtm_type = RTN_UNREACHABLE;
	else if ((rt->rt6i_flags & RTF_LOCAL) ||
		 (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK)))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;

	if (rt->rt6i_flags & RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (!(rt->rt6i_flags & RTF_ADDRCONF))
		rtm->rtm_protocol = rt->rt6i_protocol;
	else if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
		rtm->rtm_protocol = RTPROT_RA;
	else
		rtm->rtm_protocol = RTPROT_KERNEL;

	/* Attributes follow, in the order they appear on the wire. */
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	if (dst) {
		if (nla_put(skb, RTA_DST, 16, dst))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len &&
		   nla_put(skb, RTA_DST, 16, &rt->rt6i_dst.addr)) {
		goto nla_put_failure;
	}

#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put(skb, RTA_SRC, 16, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put(skb, RTA_SRC, 16, &rt->rt6i_src.addr)) {
		goto nla_put_failure;
	}
#endif

	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, nowait);

			/* A blocking caller stops on 0 and fails on any
			 * error; a non-blocking one fails only on -EMSGSIZE.
			 */
			if (err == 0 && !nowait)
				return 0;
			if (err < 0 && (!nowait || err == -EMSGSIZE))
				goto nla_put_failure;
		} else
#endif
		if (nla_put_u32(skb, RTA_IIF, iif))
			goto nla_put_failure;
	} else if (dst) {
		struct in6_addr saddr_buf;

		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
		    nla_put(skb, RTA_PREFSRC, 16, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->rt6i_prefsrc.plen &&
	    nla_put(skb, RTA_PREFSRC, 16, &rt->rt6i_prefsrc.addr))
		goto nla_put_failure;

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	/* rt->n is read under rcu_read_lock(), as in the rest of the file */
	rcu_read_lock();
	neigh = rt->n;
	if (neigh && nla_put(skb, RTA_GATEWAY, 16, &neigh->primary_key) < 0) {
		rcu_read_unlock();
		goto nla_put_failure;
	}
	rcu_read_unlock();

	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;

	/* remaining lifetime in jiffies; 0 for routes that do not expire */
	if (rt->rt6i_flags & RTF_EXPIRES)
		expires = rt->dst.expires - jiffies;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

2496
int rt6_dump_route(struct rt6_info *rt, void *p_arg)
L
Linus Torvalds 已提交
2497 2498 2499 2500
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	int prefix;

2501 2502
	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
L
Linus Torvalds 已提交
2503 2504 2505 2506
		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
	} else
		prefix = 0;

2507 2508
	return rt6_fill_node(arg->net,
		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
L
Linus Torvalds 已提交
2509
		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2510
		     prefix, 0, NLM_F_MULTI);
L
Linus Torvalds 已提交
2511 2512
}

2513
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
L
Linus Torvalds 已提交
2514
{
2515
	struct net *net = sock_net(in_skb->sk);
2516 2517
	struct nlattr *tb[RTA_MAX+1];
	struct rt6_info *rt;
L
Linus Torvalds 已提交
2518
	struct sk_buff *skb;
2519
	struct rtmsg *rtm;
2520
	struct flowi6 fl6;
2521
	int err, iif = 0, oif = 0;
L
Linus Torvalds 已提交
2522

2523 2524 2525
	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;
L
Linus Torvalds 已提交
2526

2527
	err = -EINVAL;
2528
	memset(&fl6, 0, sizeof(fl6));
L
Linus Torvalds 已提交
2529

2530 2531 2532 2533
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

A
Alexey Dobriyan 已提交
2534
		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2535 2536 2537 2538 2539 2540
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

A
Alexey Dobriyan 已提交
2541
		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2542 2543 2544 2545 2546 2547
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
2548
		oif = nla_get_u32(tb[RTA_OIF]);
L
Linus Torvalds 已提交
2549 2550 2551

	if (iif) {
		struct net_device *dev;
2552 2553
		int flags = 0;

2554
		dev = __dev_get_by_index(net, iif);
L
Linus Torvalds 已提交
2555 2556
		if (!dev) {
			err = -ENODEV;
2557
			goto errout;
L
Linus Torvalds 已提交
2558
		}
2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
							       flags);
	} else {
		fl6.flowi6_oif = oif;

		rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
L
Linus Torvalds 已提交
2571 2572
	}

2573
	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2574
	if (!skb) {
2575
		dst_release(&rt->dst);
2576 2577 2578
		err = -ENOBUFS;
		goto errout;
	}
L
Linus Torvalds 已提交
2579

2580 2581 2582
	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
2583
	skb_reset_mac_header(skb);
2584
	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
L
Linus Torvalds 已提交
2585

2586
	skb_dst_set(skb, &rt->dst);
L
Linus Torvalds 已提交
2587

2588
	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
L
Linus Torvalds 已提交
2589
			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2590
			    nlh->nlmsg_seq, 0, 0, 0);
L
Linus Torvalds 已提交
2591
	if (err < 0) {
2592 2593
		kfree_skb(skb);
		goto errout;
L
Linus Torvalds 已提交
2594 2595
	}

2596
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2597
errout:
L
Linus Torvalds 已提交
2598 2599 2600
	return err;
}

2601
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
L
Linus Torvalds 已提交
2602 2603
{
	struct sk_buff *skb;
2604
	struct net *net = info->nl_net;
2605 2606 2607 2608
	u32 seq;
	int err;

	err = -ENOBUFS;
2609
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2610

2611
	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2612
	if (!skb)
2613 2614
		goto errout;

2615
	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2616
				event, info->pid, seq, 0, 0, 0);
2617 2618 2619 2620 2621 2622
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
2623 2624 2625
	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
2626 2627
errout:
	if (err < 0)
2628
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
L
Linus Torvalds 已提交
2629 2630
}

2631 2632 2633 2634
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *data)
{
	struct net_device *dev = (struct net_device *)data;
2635
	struct net *net = dev_net(dev);
2636 2637

	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2638
		net->ipv6.ip6_null_entry->dst.dev = dev;
2639 2640
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2641
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2642
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2643
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2644 2645 2646 2647 2648 2649 2650
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	}

	return NOTIFY_OK;
}

L
Linus Torvalds 已提交
2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667
/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS

/*
 * NOTE(review): nothing in the visible part of this file references
 * this structure (the /proc handlers below work on a struct seq_file);
 * it looks like a leftover from a buffer-based /proc reader -- confirm
 * before removing.
 */
struct rt6_proc_arg
{
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};

static int rt6_info_route(struct rt6_info *rt, void *p_arg)
{
2668
	struct seq_file *m = p_arg;
2669
	struct neighbour *n;
L
Linus Torvalds 已提交
2670

2671
	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
L
Linus Torvalds 已提交
2672 2673

#ifdef CONFIG_IPV6_SUBTREES
2674
	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
L
Linus Torvalds 已提交
2675
#else
2676
	seq_puts(m, "00000000000000000000000000000000 00 ");
L
Linus Torvalds 已提交
2677
#endif
2678
	rcu_read_lock();
2679
	n = rt->n;
2680 2681
	if (n) {
		seq_printf(m, "%pi6", n->primary_key);
L
Linus Torvalds 已提交
2682
	} else {
2683
		seq_puts(m, "00000000000000000000000000000000");
L
Linus Torvalds 已提交
2684
	}
2685
	rcu_read_unlock();
2686
	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2687 2688
		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
		   rt->dst.__use, rt->rt6i_flags,
2689
		   rt->dst.dev ? rt->dst.dev->name : "");
L
Linus Torvalds 已提交
2690 2691 2692
	return 0;
}

2693
static int ipv6_route_show(struct seq_file *m, void *v)
L
Linus Torvalds 已提交
2694
{
2695
	struct net *net = (struct net *)m->private;
2696
	fib6_clean_all_ro(net, rt6_info_route, 0, m);
2697 2698
	return 0;
}
L
Linus Torvalds 已提交
2699

2700 2701
static int ipv6_route_open(struct inode *inode, struct file *file)
{
2702
	return single_open_net(inode, file, ipv6_route_show);
2703 2704
}

2705 2706 2707 2708 2709
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
2710
	.release	= single_release_net,
2711 2712
};

L
Linus Torvalds 已提交
2713 2714
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
2715
	struct net *net = (struct net *)seq->private;
L
Linus Torvalds 已提交
2716
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2717 2718 2719 2720 2721
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   net->ipv6.rt6_stats->fib_rt_alloc,
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
2722
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2723
		   net->ipv6.rt6_stats->fib_discarded_routes);
L
Linus Torvalds 已提交
2724 2725 2726 2727 2728 2729

	return 0;
}

static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
2730
	return single_open_net(inode, file, rt6_stats_seq_show);
2731 2732
}

2733
static const struct file_operations rt6_stats_seq_fops = {
L
Linus Torvalds 已提交
2734 2735 2736 2737
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
2738
	.release = single_release_net,
L
Linus Torvalds 已提交
2739 2740 2741 2742 2743 2744
};
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

static
2745
int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
L
Linus Torvalds 已提交
2746 2747
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
2748 2749 2750
	struct net *net;
	int delay;
	if (!write)
L
Linus Torvalds 已提交
2751
		return -EINVAL;
2752 2753 2754 2755 2756 2757

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
	fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
	return 0;
L
Linus Torvalds 已提交
2758 2759
}

2760
ctl_table ipv6_route_table_template[] = {
2761
	{
L
Linus Torvalds 已提交
2762
		.procname	=	"flush",
2763
		.data		=	&init_net.ipv6.sysctl.flush_delay,
L
Linus Torvalds 已提交
2764
		.maxlen		=	sizeof(int),
2765
		.mode		=	0200,
A
Alexey Dobriyan 已提交
2766
		.proc_handler	=	ipv6_sysctl_rtcache_flush
L
Linus Torvalds 已提交
2767 2768 2769
	},
	{
		.procname	=	"gc_thresh",
2770
		.data		=	&ip6_dst_ops_template.gc_thresh,
L
Linus Torvalds 已提交
2771 2772
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
2773
		.proc_handler	=	proc_dointvec,
L
Linus Torvalds 已提交
2774 2775 2776
	},
	{
		.procname	=	"max_size",
2777
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
L
Linus Torvalds 已提交
2778 2779
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
2780
		.proc_handler	=	proc_dointvec,
L
Linus Torvalds 已提交
2781 2782 2783
	},
	{
		.procname	=	"gc_min_interval",
2784
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
L
Linus Torvalds 已提交
2785 2786
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
2787
		.proc_handler	=	proc_dointvec_jiffies,
L
Linus Torvalds 已提交
2788 2789 2790
	},
	{
		.procname	=	"gc_timeout",
2791
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
L
Linus Torvalds 已提交
2792 2793
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
2794
		.proc_handler	=	proc_dointvec_jiffies,
L
Linus Torvalds 已提交
2795 2796 2797
	},
	{
		.procname	=	"gc_interval",
2798
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
L
Linus Torvalds 已提交
2799 2800
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
2801
		.proc_handler	=	proc_dointvec_jiffies,
L
Linus Torvalds 已提交
2802 2803 2804
	},
	{
		.procname	=	"gc_elasticity",
2805
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
L
Linus Torvalds 已提交
2806 2807
		.maxlen		=	sizeof(int),
		.mode		=	0644,
2808
		.proc_handler	=	proc_dointvec,
L
Linus Torvalds 已提交
2809 2810 2811
	},
	{
		.procname	=	"mtu_expires",
2812
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
L
Linus Torvalds 已提交
2813 2814
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
2815
		.proc_handler	=	proc_dointvec_jiffies,
L
Linus Torvalds 已提交
2816 2817 2818
	},
	{
		.procname	=	"min_adv_mss",
2819
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
L
Linus Torvalds 已提交
2820 2821
		.maxlen		=	sizeof(int),
		.mode		=	0644,
2822
		.proc_handler	=	proc_dointvec,
L
Linus Torvalds 已提交
2823 2824 2825
	},
	{
		.procname	=	"gc_min_interval_ms",
2826
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
L
Linus Torvalds 已提交
2827 2828
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
2829
		.proc_handler	=	proc_dointvec_ms_jiffies,
L
Linus Torvalds 已提交
2830
	},
2831
	{ }
L
Linus Torvalds 已提交
2832 2833
};

2834
/*
 * Build the IPv6 route sysctl table for @net: duplicate
 * ipv6_route_table_template and point every entry at @net's own
 * variables (entry 0 additionally gets @net in .extra1, which
 * ipv6_sysctl_rtcache_flush() reads).
 *
 * Returns the new table, or NULL if the allocation fails.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);
	if (!table)
		return NULL;

	/* Indices follow the entry order of ipv6_route_table_template. */
	table[0].data = &net->ipv6.sysctl.flush_delay;
	table[0].extra1 = net;
	table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
	table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
	table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
	table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
	table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
	table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
	table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
	table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
	table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

	return table;
}
L
Linus Torvalds 已提交
2858 2859
#endif

2860
/*
 * Per-namespace setup of the IPv6 routing state.
 *
 * Copies the dst_ops template into @net, initialises its entry counter,
 * duplicates the special route entries (null entry and, with
 * CONFIG_IPV6_MULTIPLE_TABLES, the prohibit and blackhole entries) from
 * their templates and sets the default sysctl values.
 *
 * Returns 0 on success or -ENOMEM, in which case everything set up so
 * far is released again.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	struct rt6_info *entry;
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	entry = kmemdup(&ip6_null_entry_template, sizeof(*entry), GFP_KERNEL);
	net->ipv6.ip6_null_entry = entry;
	if (!entry)
		goto out_ip6_dst_entries;
	/* the entry's dst path points back at the entry itself */
	entry->dst.path = (struct dst_entry *)entry;
	entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&entry->dst, ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	entry = kmemdup(&ip6_prohibit_entry_template, sizeof(*entry),
			GFP_KERNEL);
	net->ipv6.ip6_prohibit_entry = entry;
	if (!entry)
		goto out_ip6_null_entry;
	entry->dst.path = (struct dst_entry *)entry;
	entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&entry->dst, ip6_template_metrics, true);

	entry = kmemdup(&ip6_blk_hole_entry_template, sizeof(*entry),
			GFP_KERNEL);
	net->ipv6.ip6_blk_hole_entry = entry;
	if (!entry)
		goto out_ip6_prohibit_entry;
	entry->dst.path = (struct dst_entry *)entry;
	entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&entry->dst, ip6_template_metrics, true);
#endif

	/* Default sysctl values. */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	return 0;

	/* Error unwinding, in reverse order of setup. */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	return ret;
}

2932
/*
 * Per-namespace teardown: free the special route entries duplicated by
 * ip6_route_net_init() and destroy the dst_ops entry counter.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_blk_hole_entry);
	kfree(net->ipv6.ip6_prohibit_entry);
#endif
	kfree(net->ipv6.ip6_null_entry);

	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958
/*
 * Late per-namespace init: with CONFIG_PROC_FS, create the "ipv6_route"
 * and "rt6_stats" proc entries for @net.  The results of the two
 * creation calls are not checked; always returns 0.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
#endif
	return 0;
}

/*
 * Late per-namespace exit: with CONFIG_PROC_FS, remove the proc entries
 * created by ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_net_remove(net, "ipv6_route");
	proc_net_remove(net, "rt6_stats");
#endif
}

2959 2960 2961 2962 2963
/*
 * Per-namespace hooks for the core routing state; registered from
 * ip6_route_init() with register_pernet_subsys().
 */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979
/*
 * Per-namespace init: allocate and initialise the inet_peer base and
 * store it in net->ipv6.peers.  Returns 0, or -ENOMEM if the allocation
 * fails.
 */
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *base;

	base = kmalloc(sizeof(*base), GFP_KERNEL);
	if (!base)
		return -ENOMEM;

	inet_peer_base_init(base);
	net->ipv6.peers = base;

	return 0;
}

/*
 * Per-namespace exit: detach the inet_peer base from @net, invalidate
 * its tree and free it.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *base = net->ipv6.peers;

	/* clear the namespace pointer before the base goes away */
	net->ipv6.peers = NULL;

	inetpeer_invalidate_tree(base);
	kfree(base);
}

2984
static struct pernet_operations ipv6_inetpeer_ops = {
2985 2986 2987 2988
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

2989 2990 2991 2992 2993
/*
 * Per-namespace hooks that create and remove the proc entries;
 * registered from ip6_route_init() after fib6_rules_init().
 */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

2994 2995 2996 2997 2998
/* Notifier block that dispatches device events to ip6_route_dev_notify(). */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = 0,
};

2999
/*
 * ip6_route_init - one-time initialisation of the IPv6 routing front-end.
 *
 * Performs, in order:
 *   1. creation of the "ip6_dst_cache" slab cache for struct rt6_info;
 *   2. dst_entries_init() on ip6_dst_blackhole_ops;
 *   3. registration of the inetpeer and route per-namespace operations;
 *   4. binding of init_net's special route entries to the loopback device;
 *   5. fib6_init(), xfrm6_init() and fib6_rules_init();
 *   6. registration of the late per-namespace operations;
 *   7. registration of the PF_INET6 RTM_{NEW,DEL,GET}ROUTE rtnetlink
 *      handlers;
 *   8. registration of the netdevice notifier.
 *
 * Returns 0 on success.  On failure the non-zero error of the failing step
 * is returned (-ENOMEM for the slab cache, -ENOBUFS for the rtnetlink
 * handlers) after the steps already completed have been undone in reverse
 * order through the unwind labels at the bottom.
 */
int __init ip6_route_init(void)
{
	int ret;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* The blackhole ops share the slab cache created above. */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net.
	 *
	 * NOTE(review): the in6_dev_get() references taken here are not
	 * dropped by the unwind labels below - confirm whether that matters.
	 */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto out_xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto out_fib6_rules_init;

	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

out:
	return ret;

out_register_late_subsys:
	/*
	 * Reached either with some of the three route handlers registered
	 * (one __rtnl_register() call failed) or with all of them registered
	 * (the notifier registration failed).  Drop them so that no handler
	 * stays installed after a failed init.
	 *
	 * NOTE(review): rtnl_unregister_all() removes every PF_INET6 handler,
	 * not only the three above; that is assumed acceptable here because
	 * the whole initialisation is being abandoned - confirm against the
	 * caller's init order.
	 */
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
out_fib6_rules_init:
	fib6_rules_cleanup();
out_xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

void ip6_route_cleanup(void)
{
3085
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
3086
	unregister_pernet_subsys(&ip6_route_net_late_ops);
T
Thomas Graf 已提交
3087
	fib6_rules_cleanup();
L
Linus Torvalds 已提交
3088 3089
	xfrm6_fini();
	fib6_gc_cleanup();
3090
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
3091
	unregister_pernet_subsys(&ip6_route_net_ops);
3092
	dst_entries_destroy(&ip6_dst_blackhole_ops);
3093
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
L
Linus Torvalds 已提交
3094
}