route.c 72.0 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
6
 *	Pedro Roque		<roque@di.fc.ul.pt>
L
Linus Torvalds 已提交
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
23 24
 *	Ville Nuorvala
 *		Fixed routing subtrees.
L
Linus Torvalds 已提交
25 26
 */

27
#include <linux/capability.h>
L
Linus Torvalds 已提交
28
#include <linux/errno.h>
29
#include <linux/export.h>
L
Linus Torvalds 已提交
30 31 32 33 34 35 36 37
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
38
#include <linux/mroute6.h>
L
Linus Torvalds 已提交
39 40 41 42
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
43
#include <linux/nsproxy.h>
44
#include <linux/slab.h>
45
#include <net/net_namespace.h>
L
Linus Torvalds 已提交
46 47 48 49 50 51 52 53 54 55
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/xfrm.h>
56
#include <net/netevent.h>
57
#include <net/netlink.h>
L
Linus Torvalds 已提交
58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75

#include <asm/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

/* Set to 3 to get tracing. */
#define RT6_DEBUG 2

#if RT6_DEBUG >= 3
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#else
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#endif

E
Eric Dumazet 已提交
76 77
static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
				    const struct in6_addr *dest);
L
Linus Torvalds 已提交
78
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
79
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
80
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
L
Linus Torvalds 已提交
81 82 83 84
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
85
static int		 ip6_dst_gc(struct dst_ops *ops);
L
Linus Torvalds 已提交
86 87 88 89 90 91

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);

92
#ifdef CONFIG_IPV6_ROUTE_INFO
93
static struct rt6_info *rt6_add_route_info(struct net *net,
94 95
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex,
96
					   unsigned pref);
97
static struct rt6_info *rt6_get_route_info(struct net *net,
98 99
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex);
100 101
#endif

102 103 104 105 106 107
static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

108 109 110
	if (!(rt->dst.flags & DST_HOST))
		return NULL;

111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
	if (!rt->rt6i_peer)
		rt6_bind_peer(rt, 1);

	peer = rt->rt6i_peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		}
	}
	return p;
}

135 136 137 138 139
/*
 * dst_ops->neigh_lookup hook: look @daddr up in the ND table for the
 * device the dst is attached to.
 */
static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;

	return __neigh_lookup_errno(&nd_tbl, daddr, dev);
}

140
static struct dst_ops ip6_dst_ops_template = {
L
Linus Torvalds 已提交
141
	.family			=	AF_INET6,
142
	.protocol		=	cpu_to_be16(ETH_P_IPV6),
L
Linus Torvalds 已提交
143 144 145
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
146
	.default_advmss		=	ip6_default_advmss,
147
	.mtu			=	ip6_mtu,
148
	.cow_metrics		=	ipv6_cow_metrics,
L
Linus Torvalds 已提交
149 150 151 152 153
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
154
	.local_out		=	__ip6_local_out,
155
	.neigh_lookup		=	ip6_neigh_lookup,
L
Linus Torvalds 已提交
156 157
};

158
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
159
{
160 161 162
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
163 164
}

165 166 167 168
/* dst_ops->update_pmtu hook for blackhole routes: intentionally a no-op. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	/* nothing to do */
}

169 170 171 172 173 174
/*
 * dst_ops->cow_metrics hook for blackhole routes: never hands out a
 * writable metrics array.
 */
static u32 *ip6_rt_blackhole_cow_metrics(struct dst_entry *dst,
					 unsigned long old)
{
	u32 *writable = NULL;

	return writable;
}

175 176
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
177
	.protocol		=	cpu_to_be16(ETH_P_IPV6),
178 179
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
180
	.mtu			=	ip6_blackhole_mtu,
181
	.default_advmss		=	ip6_default_advmss,
182
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
183
	.cow_metrics		=	ip6_rt_blackhole_cow_metrics,
184
	.neigh_lookup		=	ip6_neigh_lookup,
185 186
};

187 188 189 190
/* Read-only metrics table: hop limit 255, every other metric zero. */
static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 255,
};

191
static struct rt6_info ip6_null_entry_template = {
192 193 194 195 196 197 198
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
L
Linus Torvalds 已提交
199 200
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
201
	.rt6i_protocol  = RTPROT_KERNEL,
L
Linus Torvalds 已提交
202 203 204 205
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

T
Thomas Graf 已提交
206 207
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

208 209 210
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

211
static struct rt6_info ip6_prohibit_entry_template = {
212 213 214 215 216 217 218
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
T
Thomas Graf 已提交
219 220
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
221
	.rt6i_protocol  = RTPROT_KERNEL,
T
Thomas Graf 已提交
222 223 224 225
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

226
static struct rt6_info ip6_blk_hole_entry_template = {
227 228 229 230 231 232 233
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= -1,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard,
T
Thomas Graf 已提交
234 235
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
236
	.rt6i_protocol  = RTPROT_KERNEL,
T
Thomas Graf 已提交
237 238 239 240 241 242
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
};

#endif

L
Linus Torvalds 已提交
243
/* allocate dst with ip6_dst_ops */
244
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops,
245 246
					     struct net_device *dev,
					     int flags)
L
Linus Torvalds 已提交
247
{
248
	struct rt6_info *rt = dst_alloc(ops, dev, 0, 0, flags);
249

250
	if (rt)
251
		memset(&rt->rt6i_table, 0,
252
		       sizeof(*rt) - sizeof(struct dst_entry));
253 254

	return rt;
L
Linus Torvalds 已提交
255 256 257 258 259 260
}

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
261
	struct inet_peer *peer = rt->rt6i_peer;
L
Linus Torvalds 已提交
262

263 264 265
	if (!(rt->dst.flags & DST_HOST))
		dst_destroy_metrics_generic(dst);

266
	if (idev) {
L
Linus Torvalds 已提交
267 268
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
269
	}
270 271 272 273 274 275
	if (peer) {
		rt->rt6i_peer = NULL;
		inet_putpeer(peer);
	}
}

276 277 278 279 280 281 282
/* Generation counter compared against rt6_info::rt6i_peer_genid. */
static atomic_t __rt6_peer_genid = ATOMIC_INIT(0);

/* Read the current peer generation id. */
static u32 rt6_peer_genid(void)
{
	u32 genid = atomic_read(&__rt6_peer_genid);

	return genid;
}

283 284 285 286 287 288 289
/*
 * Attach an inet_peer for the route's destination address to @rt.
 *
 * @create is forwarded to inet_getpeer_v6().  If another CPU attached a
 * peer first (the cmpxchg() loses), the reference just obtained is
 * dropped; in every other case the route's peer generation id is
 * refreshed.
 */
void rt6_bind_peer(struct rt6_info *rt, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v6(&rt->rt6i_dst.addr, create);

	if (!peer || cmpxchg(&rt->rt6i_peer, NULL, peer) == NULL) {
		rt->rt6i_peer_genid = rt6_peer_genid();
		return;
	}

	/* Lost the race: the route already has a peer. */
	inet_putpeer(peer);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
299
	struct net_device *loopback_dev =
300
		dev_net(dev)->loopback_dev;
L
Linus Torvalds 已提交
301

302
	if (dev != loopback_dev && idev && idev->dev == dev) {
303 304
		struct inet6_dev *loopback_idev =
			in6_dev_get(loopback_dev);
305
		if (loopback_idev) {
L
Linus Torvalds 已提交
306 307 308 309 310 311 312 313
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static __inline__ int rt6_check_expired(const struct rt6_info *rt)
{
E
Eric Dumazet 已提交
314 315
	return (rt->rt6i_flags & RTF_EXPIRES) &&
		time_after(jiffies, rt->rt6i_expires);
L
Linus Torvalds 已提交
316 317
}

318
static inline int rt6_need_strict(const struct in6_addr *daddr)
T
Thomas Graf 已提交
319
{
E
Eric Dumazet 已提交
320 321
	return ipv6_addr_type(daddr) &
		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
T
Thomas Graf 已提交
322 323
}

L
Linus Torvalds 已提交
324
/*
T
Thomas Graf 已提交
325
 *	Route lookup. Any table->tb6_lock is implied.
L
Linus Torvalds 已提交
326 327
 */

328 329
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
330
						    const struct in6_addr *saddr,
L
Linus Torvalds 已提交
331
						    int oif,
332
						    int flags)
L
Linus Torvalds 已提交
333 334 335 336
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

337 338 339
	if (!oif && ipv6_addr_any(saddr))
		goto out;

340
	for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
341 342 343
		struct net_device *dev = sprt->rt6i_dev;

		if (oif) {
L
Linus Torvalds 已提交
344 345 346
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
347
				if (!sprt->rt6i_idev ||
L
Linus Torvalds 已提交
348
				    sprt->rt6i_idev->dev->ifindex != oif) {
349
					if (flags & RT6_LOOKUP_F_IFACE && oif)
L
Linus Torvalds 已提交
350
						continue;
351
					if (local && (!oif ||
L
Linus Torvalds 已提交
352 353 354 355 356
						      local->rt6i_idev->dev->ifindex == oif))
						continue;
				}
				local = sprt;
			}
357 358 359 360
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
L
Linus Torvalds 已提交
361
		}
362
	}
L
Linus Torvalds 已提交
363

364
	if (oif) {
L
Linus Torvalds 已提交
365 366 367
		if (local)
			return local;

368
		if (flags & RT6_LOOKUP_F_IFACE)
369
			return net->ipv6.ip6_null_entry;
L
Linus Torvalds 已提交
370
	}
371
out:
L
Linus Torvalds 已提交
372 373 374
	return rt;
}

375 376 377
#ifdef CONFIG_IPV6_ROUTER_PREF
static void rt6_probe(struct rt6_info *rt)
{
378
	struct neighbour *neigh;
379 380 381 382 383 384 385 386
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
387 388
	rcu_read_lock();
	neigh = rt ? dst_get_neighbour(&rt->dst) : NULL;
389
	if (!neigh || (neigh->nud_state & NUD_VALID))
390
		goto out;
391 392
	read_lock_bh(&neigh->lock);
	if (!(neigh->nud_state & NUD_VALID) &&
393
	    time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
394 395 396 397 398 399 400 401 402
		struct in6_addr mcaddr;
		struct in6_addr *target;

		neigh->updated = jiffies;
		read_unlock_bh(&neigh->lock);

		target = (struct in6_addr *)&neigh->primary_key;
		addrconf_addr_solict_mult(target, &mcaddr);
		ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
403
	} else {
404
		read_unlock_bh(&neigh->lock);
405 406 407
	}
out:
	rcu_read_unlock();
408 409 410 411 412 413 414
}
#else
/* Router preference support is compiled out: probing is a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
	/* nothing to do */
}
#endif

L
Linus Torvalds 已提交
415
/*
416
 * Default Router Selection (RFC 2461 6.3.6)
L
Linus Torvalds 已提交
417
 */
D
Dave Jones 已提交
418
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
419 420
{
	struct net_device *dev = rt->rt6i_dev;
421
	if (!oif || dev->ifindex == oif)
422
		return 2;
423 424 425 426
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
427
}
L
Linus Torvalds 已提交
428

D
Dave Jones 已提交
429
static inline int rt6_check_neigh(struct rt6_info *rt)
L
Linus Torvalds 已提交
430
{
431
	struct neighbour *neigh;
432
	int m;
433 434 435

	rcu_read_lock();
	neigh = dst_get_neighbour(&rt->dst);
436 437 438 439
	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		m = 1;
	else if (neigh) {
440 441
		read_lock_bh(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
442
			m = 2;
443 444 445 446 447
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (neigh->nud_state & NUD_FAILED)
			m = 0;
#endif
		else
448
			m = 1;
449
		read_unlock_bh(&neigh->lock);
450 451
	} else
		m = 0;
452
	rcu_read_unlock();
453
	return m;
L
Linus Torvalds 已提交
454 455
}

456 457
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
L
Linus Torvalds 已提交
458
{
459
	int m, n;
460

461
	m = rt6_check_dev(rt, oif);
462
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
463
		return -1;
464 465 466
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
467
	n = rt6_check_neigh(rt);
468
	if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
469 470 471 472
		return -1;
	return m;
}

473 474
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match)
475
{
476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502
	int m;

	if (rt6_check_expired(rt))
		goto out;

	m = rt6_score_route(rt, oif, strict);
	if (m < 0)
		goto out;

	if (m > *mpri) {
		if (strict & RT6_LOOKUP_F_REACHABLE)
			rt6_probe(match);
		*mpri = m;
		match = rt;
	} else if (strict & RT6_LOOKUP_F_REACHABLE) {
		rt6_probe(rt);
	}

out:
	return match;
}

/*
 * Find the best route among the entries of @fn that share @metric.
 *
 * The scan starts at the round-robin head @rr_head and runs to the end
 * of the equal-metric run, then wraps around from fn->leaf up to (but
 * not including) @rr_head.  Returns NULL when no entry qualifies.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict)
{
	struct rt6_info *best = NULL;
	struct rt6_info *cur;
	int mpri = -1;

	cur = rr_head;
	while (cur && cur->rt6i_metric == metric) {
		best = find_match(cur, oif, strict, &mpri, best);
		cur = cur->dst.rt6_next;
	}

	cur = fn->leaf;
	while (cur && cur != rr_head && cur->rt6i_metric == metric) {
		best = find_match(cur, oif, strict, &mpri, best);
		cur = cur->dst.rt6_next;
	}

	return best;
}
L
Linus Torvalds 已提交
515

516 517 518
static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
{
	struct rt6_info *match, *rt0;
519
	struct net *net;
L
Linus Torvalds 已提交
520

521
	RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
522
		  __func__, fn->leaf, oif);
523

524 525 526
	rt0 = fn->rr_ptr;
	if (!rt0)
		fn->rr_ptr = rt0 = fn->leaf;
L
Linus Torvalds 已提交
527

528
	match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
L
Linus Torvalds 已提交
529

530
	if (!match &&
531
	    (strict & RT6_LOOKUP_F_REACHABLE)) {
532
		struct rt6_info *next = rt0->dst.rt6_next;
533

534
		/* no entries matched; do round-robin */
535 536 537 538 539
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
			next = fn->leaf;

		if (next != rt0)
			fn->rr_ptr = next;
L
Linus Torvalds 已提交
540 541
	}

542
	RT6_TRACE("%s() => %p\n",
543
		  __func__, match);
L
Linus Torvalds 已提交
544

545
	net = dev_net(rt0->rt6i_dev);
E
Eric Dumazet 已提交
546
	return match ? match : net->ipv6.ip6_null_entry;
L
Linus Torvalds 已提交
547 548
}

549 550
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received in a Router Advertisement.
 *
 * @dev:    device the RA arrived on
 * @opt:    start of the option (interpreted as struct route_info)
 * @len:    number of bytes available at @opt
 * @gwaddr: address of the advertising router
 *
 * Adds, refreshes or (for a zero lifetime) deletes the matching
 * route-info route.  Returns 0 on success or -EINVAL when the option is
 * malformed.
 *
 * Validation follows RFC 4191 section 2.3: the option is 8 bytes of
 * header followed by 0, 8 or 16 bytes of prefix, so a Prefix Length
 * above 64 needs Length 3 and any non-zero Prefix Length needs Length 2
 * or 3.  Accepting shorter options would make the prefix copy below read
 * past the end of the option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	/* A negative len must not slip through the unsigned comparison. */
	if (len < 0 || (size_t)len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3)
		return -EINVAL;
	if (rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 3)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 2)
		return -EINVAL;

	/* The option must fit in the buffer we were given. */
	if (len < (rinfo->length << 3))
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime)) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		dst_release(&rt->dst);
	}
	return 0;
}
#endif

623
/*
 * Lookup helper used after a per-node selection returned the null entry.
 *
 * Relies on the caller providing local variables "rt" and "fn" and the
 * labels "out" and "restart".  Starting from fn it climbs towards the
 * tree root (descending into a parent's source subtree when there is
 * one that fn is not part of); it jumps to "restart" at the first node
 * carrying route info and to "out" once the top-level root is reached.
 */
#define BACKTRACK(__net, saddr)						\
do {									\
	if (rt == __net->ipv6.ip6_null_entry) {				\
		struct fib6_node *pn;					\
		for (;;) {						\
			if (fn->fn_flags & RTN_TL_ROOT)			\
				goto out;				\
			pn = fn->parent;				\
			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)	\
				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
			else						\
				fn = pn;				\
			if (fn->fn_flags & RTN_RTINFO)			\
				goto restart;				\
		}							\
	}								\
} while (0)
T
Thomas Graf 已提交
640

641 642
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
643
					     struct flowi6 *fl6, int flags)
L
Linus Torvalds 已提交
644 645 646 647
{
	struct fib6_node *fn;
	struct rt6_info *rt;

T
Thomas Graf 已提交
648
	read_lock_bh(&table->tb6_lock);
649
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
T
Thomas Graf 已提交
650 651
restart:
	rt = fn->leaf;
652 653
	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
	BACKTRACK(net, &fl6->saddr);
T
Thomas Graf 已提交
654
out:
655
	dst_use(&rt->dst, jiffies);
T
Thomas Graf 已提交
656 657 658 659 660
	read_unlock_bh(&table->tb6_lock);
	return rt;

}

661 662
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
			    const struct in6_addr *saddr, int oif, int strict)
T
Thomas Graf 已提交
663
{
664 665 666
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
T
Thomas Graf 已提交
667 668
	};
	struct dst_entry *dst;
669
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
T
Thomas Graf 已提交
670

671
	if (saddr) {
672
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
673 674 675
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

676
	dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
T
Thomas Graf 已提交
677 678 679 680 681
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

L
Linus Torvalds 已提交
682 683 684
	return NULL;
}

685 686
EXPORT_SYMBOL(rt6_lookup);

T
Thomas Graf 已提交
687
/* ip6_ins_rt is called with FREE table->tb6_lock.
   It takes a new route entry; if the addition fails for any reason, the
   route is freed. In any case, if the caller does not hold it, it may
   be destroyed.
 */

693
static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
L
Linus Torvalds 已提交
694 695
{
	int err;
T
Thomas Graf 已提交
696
	struct fib6_table *table;
L
Linus Torvalds 已提交
697

T
Thomas Graf 已提交
698 699
	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
700
	err = fib6_add(&table->tb6_root, rt, info);
T
Thomas Graf 已提交
701
	write_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
702 703 704 705

	return err;
}

706 707
int ip6_ins_rt(struct rt6_info *rt)
{
708
	struct nl_info info = {
709
		.nl_net = dev_net(rt->rt6i_dev),
710
	};
711
	return __ip6_ins_rt(rt, &info);
712 713
}

E
Eric Dumazet 已提交
714 715
static struct rt6_info *rt6_alloc_cow(const struct rt6_info *ort,
				      const struct in6_addr *daddr,
716
				      const struct in6_addr *saddr)
L
Linus Torvalds 已提交
717 718 719 720 721 722 723
{
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

E
Eric Dumazet 已提交
724
	rt = ip6_rt_copy(ort, daddr);
L
Linus Torvalds 已提交
725 726

	if (rt) {
727 728 729
		struct neighbour *neigh;
		int attempts = !in_softirq();

730
		if (!(rt->rt6i_flags & RTF_GATEWAY)) {
731
			if (rt->rt6i_dst.plen != 128 &&
E
Eric Dumazet 已提交
732
			    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
733
				rt->rt6i_flags |= RTF_ANYCAST;
A
Alexey Dobriyan 已提交
734
			rt->rt6i_gateway = *daddr;
735
		}
L
Linus Torvalds 已提交
736 737 738 739 740

		rt->rt6i_flags |= RTF_CACHE;

#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
A
Alexey Dobriyan 已提交
741
			rt->rt6i_src.addr = *saddr;
L
Linus Torvalds 已提交
742 743 744 745
			rt->rt6i_src.plen = 128;
		}
#endif

746
	retry:
747 748
		neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway,
					     rt->rt6i_dev);
749 750 751 752 753 754 755 756 757 758 759
		if (IS_ERR(neigh)) {
			struct net *net = dev_net(rt->rt6i_dev);
			int saved_rt_min_interval =
				net->ipv6.sysctl.ip6_rt_gc_min_interval;
			int saved_rt_elasticity =
				net->ipv6.sysctl.ip6_rt_gc_elasticity;

			if (attempts-- > 0) {
				net->ipv6.sysctl.ip6_rt_gc_elasticity = 1;
				net->ipv6.sysctl.ip6_rt_gc_min_interval = 0;

760
				ip6_dst_gc(&net->ipv6.ip6_dst_ops);
761 762 763 764 765 766 767 768 769 770

				net->ipv6.sysctl.ip6_rt_gc_elasticity =
					saved_rt_elasticity;
				net->ipv6.sysctl.ip6_rt_gc_min_interval =
					saved_rt_min_interval;
				goto retry;
			}

			if (net_ratelimit())
				printk(KERN_WARNING
771
				       "ipv6: Neighbour table overflow.\n");
772
			dst_free(&rt->dst);
773 774
			return NULL;
		}
775
		dst_set_neighbour(&rt->dst, neigh);
L
Linus Torvalds 已提交
776

777
	}
L
Linus Torvalds 已提交
778

779 780
	return rt;
}
L
Linus Torvalds 已提交
781

E
Eric Dumazet 已提交
782 783
static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
					const struct in6_addr *daddr)
784
{
E
Eric Dumazet 已提交
785 786
	struct rt6_info *rt = ip6_rt_copy(ort, daddr);

787 788
	if (rt) {
		rt->rt6i_flags |= RTF_CACHE;
789
		dst_set_neighbour(&rt->dst, neigh_clone(dst_get_neighbour_raw(&ort->dst)));
790 791 792 793
	}
	return rt;
}

794
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
795
				      struct flowi6 *fl6, int flags)
L
Linus Torvalds 已提交
796 797
{
	struct fib6_node *fn;
798
	struct rt6_info *rt, *nrt;
T
Thomas Graf 已提交
799
	int strict = 0;
L
Linus Torvalds 已提交
800
	int attempts = 3;
801
	int err;
802
	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
L
Linus Torvalds 已提交
803

804
	strict |= flags & RT6_LOOKUP_F_IFACE;
L
Linus Torvalds 已提交
805 806

relookup:
T
Thomas Graf 已提交
807
	read_lock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
808

809
restart_2:
810
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
L
Linus Torvalds 已提交
811 812

restart:
813
	rt = rt6_select(fn, oif, strict | reachable);
814

815
	BACKTRACK(net, &fl6->saddr);
816
	if (rt == net->ipv6.ip6_null_entry ||
817
	    rt->rt6i_flags & RTF_CACHE)
818
		goto out;
L
Linus Torvalds 已提交
819

820
	dst_hold(&rt->dst);
T
Thomas Graf 已提交
821
	read_unlock_bh(&table->tb6_lock);
822

823
	if (!dst_get_neighbour_raw(&rt->dst) && !(rt->rt6i_flags & RTF_NONEXTHOP))
824
		nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
825
	else if (!(rt->dst.flags & DST_HOST))
826
		nrt = rt6_alloc_clone(rt, &fl6->daddr);
827 828
	else
		goto out2;
829

830
	dst_release(&rt->dst);
831
	rt = nrt ? : net->ipv6.ip6_null_entry;
L
Linus Torvalds 已提交
832

833
	dst_hold(&rt->dst);
834
	if (nrt) {
835
		err = ip6_ins_rt(nrt);
836
		if (!err)
L
Linus Torvalds 已提交
837 838 839
			goto out2;
	}

840 841 842 843
	if (--attempts <= 0)
		goto out2;

	/*
T
Thomas Graf 已提交
844
	 * Race condition! In the gap, when table->tb6_lock was
845 846
	 * released someone could insert this route.  Relookup.
	 */
847
	dst_release(&rt->dst);
848 849 850
	goto relookup;

out:
851 852 853 854
	if (reachable) {
		reachable = 0;
		goto restart_2;
	}
855
	dst_hold(&rt->dst);
T
Thomas Graf 已提交
856
	read_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
857
out2:
858 859
	rt->dst.lastuse = jiffies;
	rt->dst.__use++;
T
Thomas Graf 已提交
860 861

	return rt;
L
Linus Torvalds 已提交
862 863
}

864
static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
865
					    struct flowi6 *fl6, int flags)
866
{
867
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
868 869
}

T
Thomas Graf 已提交
870 871
void ip6_route_input(struct sk_buff *skb)
{
872
	const struct ipv6hdr *iph = ipv6_hdr(skb);
873
	struct net *net = dev_net(skb->dev);
874
	int flags = RT6_LOOKUP_F_HAS_SADDR;
875 876 877 878
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
879
		.flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK,
880 881
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
T
Thomas Graf 已提交
882
	};
883

T
Thomas Goff 已提交
884
	if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG)
885
		flags |= RT6_LOOKUP_F_IFACE;
T
Thomas Graf 已提交
886

887
	skb_dst_set(skb, fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input));
T
Thomas Graf 已提交
888 889
}

890
static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
891
					     struct flowi6 *fl6, int flags)
L
Linus Torvalds 已提交
892
{
893
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
T
Thomas Graf 已提交
894 895
}

896
struct dst_entry * ip6_route_output(struct net *net, const struct sock *sk,
897
				    struct flowi6 *fl6)
T
Thomas Graf 已提交
898 899 900
{
	int flags = 0;

901
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr))
902
		flags |= RT6_LOOKUP_F_IFACE;
T
Thomas Graf 已提交
903

904
	if (!ipv6_addr_any(&fl6->saddr))
905
		flags |= RT6_LOOKUP_F_HAS_SADDR;
906 907
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
908

909
	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
L
Linus Torvalds 已提交
910 911
}

912
EXPORT_SYMBOL(ip6_route_output);
L
Linus Torvalds 已提交
913

914
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
915
{
916
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
917 918
	struct dst_entry *new = NULL;

919
	rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, 0, 0);
920
	if (rt) {
921 922
		memset(&rt->rt6i_table, 0, sizeof(*rt) - sizeof(struct dst_entry));

923
		new = &rt->dst;
924 925

		new->__use = 1;
926 927
		new->input = dst_discard;
		new->output = dst_discard;
928

E
Eric Dumazet 已提交
929 930 931 932
		if (dst_metrics_read_only(&ort->dst))
			new->_metrics = ort->dst._metrics;
		else
			dst_copy_metrics(new, &ort->dst);
933 934 935 936 937
		rt->rt6i_idev = ort->rt6i_idev;
		if (rt->rt6i_idev)
			in6_dev_hold(rt->rt6i_idev);
		rt->rt6i_expires = 0;

A
Alexey Dobriyan 已提交
938
		rt->rt6i_gateway = ort->rt6i_gateway;
939 940 941 942 943 944 945 946 947 948 949
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif

		dst_free(new);
	}

950 951
	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
952 953
}

L
Linus Torvalds 已提交
954 955 956 957 958 959 960 961 962 963
/*
 *	Destination cache support functions
 */

/*
 * dst_ops->check hook: a cached dst is still valid when it is linked to
 * a FIB node whose serial number equals @cookie.
 *
 * On success the route's peer generation id is brought up to date
 * (binding an existing peer, without creating one, if none is attached
 * yet).  Returns @dst when valid, NULL otherwise.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (!rt->rt6i_node || rt->rt6i_node->fn_sernum != cookie)
		return NULL;

	if (rt->rt6i_peer_genid != rt6_peer_genid()) {
		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 0);
		rt->rt6i_peer_genid = rt6_peer_genid();
	}

	return dst;
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
980 981 982 983 984 985
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				ip6_del_rt(rt);
				dst = NULL;
			}
		} else {
L
Linus Torvalds 已提交
986
			dst_release(dst);
987 988
			dst = NULL;
		}
L
Linus Torvalds 已提交
989
	}
990
	return dst;
L
Linus Torvalds 已提交
991 992 993 994 995 996
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

997
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
L
Linus Torvalds 已提交
998

E
Eric Dumazet 已提交
999
	rt = (struct rt6_info *) skb_dst(skb);
L
Linus Torvalds 已提交
1000
	if (rt) {
1001
		if (rt->rt6i_flags & RTF_CACHE) {
1002
			dst_set_expires(&rt->dst, 0);
L
Linus Torvalds 已提交
1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015
			rt->rt6i_flags |= RTF_EXPIRES;
		} else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
			rt->rt6i_node->fn_sernum = -1;
	}
}

/*
 * dst_ops->update_pmtu hook.
 *
 * Only /128 routes are updated, and only when @mtu is smaller than the
 * current one.  A value below IPV6_MIN_MTU is clamped to IPV6_MIN_MTU
 * and RTAX_FEATURE_ALLFRAG is set on the route.
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rt6_info *rt6 = (struct rt6_info*)dst;

	if (mtu >= dst_mtu(dst) || rt6->rt6i_dst.plen != 128)
		return;

	rt6->rt6i_flags |= RTF_MODIFIED;

	if (mtu < IPV6_MIN_MTU) {
		u32 features = dst_metric(dst, RTAX_FEATURES);

		mtu = IPV6_MIN_MTU;
		dst_metric_set(dst, RTAX_FEATURES,
			       features | RTAX_FEATURE_ALLFRAG);
	}

	dst_metric_set(dst, RTAX_MTU, mtu);
}

1025
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
L
Linus Torvalds 已提交
1026
{
1027 1028 1029 1030
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

L
Linus Torvalds 已提交
1031 1032
	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

1033 1034
	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
L
Linus Torvalds 已提交
1035 1036

	/*
1037 1038 1039
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
L
Linus Torvalds 已提交
1040 1041 1042 1043 1044 1045 1046
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}

1047
static unsigned int ip6_mtu(const struct dst_entry *dst)
1048 1049
{
	struct inet6_dev *idev;
1050 1051 1052 1053 1054 1055
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = IPV6_MIN_MTU;
1056 1057 1058 1059 1060 1061 1062 1063 1064 1065

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}

1066 1067
/*
 * Singly linked list (chained through dst_entry.next) of the dsts created
 * by icmp6_dst_alloc().  icmp6_dst_gc() and icmp6_clean_all() unlink and
 * free entries from it.  icmp6_dst_lock protects the list and is always
 * taken with bottom halves disabled.
 */
static struct dst_entry *icmp6_dst_gc_list;
static DEFINE_SPINLOCK(icmp6_dst_lock);
1068

1069
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
L
Linus Torvalds 已提交
1070
				  struct neighbour *neigh,
1071
				  const struct in6_addr *addr)
L
Linus Torvalds 已提交
1072 1073 1074
{
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
1075
	struct net *net = dev_net(dev);
L
Linus Torvalds 已提交
1076

1077
	if (unlikely(!idev))
L
Linus Torvalds 已提交
1078 1079
		return NULL;

1080
	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0);
1081
	if (unlikely(!rt)) {
L
Linus Torvalds 已提交
1082 1083 1084 1085 1086 1087
		in6_dev_put(idev);
		goto out;
	}

	if (neigh)
		neigh_hold(neigh);
1088
	else {
1089
		neigh = __neigh_lookup_errno(&nd_tbl, addr, dev);
1090 1091 1092
		if (IS_ERR(neigh))
			neigh = NULL;
	}
L
Linus Torvalds 已提交
1093

1094 1095
	rt->dst.flags |= DST_HOST;
	rt->dst.output  = ip6_output;
1096
	dst_set_neighbour(&rt->dst, neigh);
1097
	atomic_set(&rt->dst.__refcnt, 1);
A
Alexey Dobriyan 已提交
1098
	rt->rt6i_dst.addr = *addr;
1099 1100
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
1101
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 255);
L
Linus Torvalds 已提交
1102

1103
	spin_lock_bh(&icmp6_dst_lock);
1104 1105
	rt->dst.next = icmp6_dst_gc_list;
	icmp6_dst_gc_list = &rt->dst;
1106
	spin_unlock_bh(&icmp6_dst_lock);
L
Linus Torvalds 已提交
1107

1108
	fib6_force_start_gc(net);
L
Linus Torvalds 已提交
1109 1110

out:
1111
	return &rt->dst;
L
Linus Torvalds 已提交
1112 1113
}

1114
int icmp6_dst_gc(void)
L
Linus Torvalds 已提交
1115
{
1116
	struct dst_entry *dst, **pprev;
1117
	int more = 0;
L
Linus Torvalds 已提交
1118

1119 1120
	spin_lock_bh(&icmp6_dst_lock);
	pprev = &icmp6_dst_gc_list;
1121

L
Linus Torvalds 已提交
1122 1123 1124 1125 1126 1127
	while ((dst = *pprev) != NULL) {
		if (!atomic_read(&dst->__refcnt)) {
			*pprev = dst->next;
			dst_free(dst);
		} else {
			pprev = &dst->next;
1128
			++more;
L
Linus Torvalds 已提交
1129 1130 1131
		}
	}

1132
	spin_unlock_bh(&icmp6_dst_lock);
1133

1134
	return more;
L
Linus Torvalds 已提交
1135 1136
}

1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155
/*
 * Call @func(rt, @arg) for every entry on icmp6_dst_gc_list, with
 * icmp6_dst_lock held.  Entries for which @func returns non-zero are
 * unlinked from the list and freed.
 */
static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
			    void *arg)
{
	struct dst_entry **link = &icmp6_dst_gc_list;
	struct dst_entry *entry;

	spin_lock_bh(&icmp6_dst_lock);

	while ((entry = *link) != NULL) {
		if (!func((struct rt6_info *) entry, arg)) {
			link = &entry->next;
			continue;
		}

		*link = entry->next;
		dst_free(entry);
	}

	spin_unlock_bh(&icmp6_dst_lock);
}

1156
static int ip6_dst_gc(struct dst_ops *ops)
L
Linus Torvalds 已提交
1157 1158
{
	unsigned long now = jiffies;
1159
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1160 1161 1162 1163 1164
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1165
	int entries;
1166

1167
	entries = dst_entries_get_fast(ops);
1168
	if (time_after(rt_last_gc + rt_min_interval, now) &&
1169
	    entries <= rt_max_size)
L
Linus Torvalds 已提交
1170 1171
		goto out;

1172 1173 1174
	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
	net->ipv6.ip6_rt_last_gc = now;
1175 1176
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
1177
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
L
Linus Torvalds 已提交
1178
out:
1179
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1180
	return entries > rt_max_size;
L
Linus Torvalds 已提交
1181 1182 1183 1184 1185 1186 1187 1188
}

/* Clean host part of a prefix. Not necessary in radix tree,
   but results in cleaner routing tables.

   Remove it only when all the things will work!
 */

1189
int ip6_dst_hoplimit(struct dst_entry *dst)
L
Linus Torvalds 已提交
1190
{
1191
	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
1192
	if (hoplimit == 0) {
1193
		struct net_device *dev = dst->dev;
1194 1195 1196 1197 1198
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		if (idev)
1199
			hoplimit = idev->cnf.hop_limit;
1200
		else
1201
			hoplimit = dev_net(dev)->ipv6.devconf_all->hop_limit;
1202
		rcu_read_unlock();
L
Linus Torvalds 已提交
1203 1204 1205
	}
	return hoplimit;
}
1206
EXPORT_SYMBOL(ip6_dst_hoplimit);
L
Linus Torvalds 已提交
1207 1208 1209 1210 1211

/*
 *
 */

1212
int ip6_route_add(struct fib6_config *cfg)
L
Linus Torvalds 已提交
1213 1214
{
	int err;
1215
	struct net *net = cfg->fc_nlinfo.nl_net;
L
Linus Torvalds 已提交
1216 1217 1218
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
T
Thomas Graf 已提交
1219
	struct fib6_table *table;
L
Linus Torvalds 已提交
1220 1221
	int addr_type;

1222
	if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
L
Linus Torvalds 已提交
1223 1224
		return -EINVAL;
#ifndef CONFIG_IPV6_SUBTREES
1225
	if (cfg->fc_src_len)
L
Linus Torvalds 已提交
1226 1227
		return -EINVAL;
#endif
1228
	if (cfg->fc_ifindex) {
L
Linus Torvalds 已提交
1229
		err = -ENODEV;
1230
		dev = dev_get_by_index(net, cfg->fc_ifindex);
L
Linus Torvalds 已提交
1231 1232 1233 1234 1235 1236 1237
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

1238 1239
	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;
L
Linus Torvalds 已提交
1240

1241
	err = -ENOBUFS;
1242 1243
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1244
		table = fib6_get_table(net, cfg->fc_table);
1245
		if (!table) {
1246 1247 1248 1249 1250 1251
			printk(KERN_WARNING "IPv6: NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}
1252 1253

	if (!table)
T
Thomas Graf 已提交
1254 1255
		goto out;

1256
	rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, NULL, DST_NOCOUNT);
L
Linus Torvalds 已提交
1257

1258
	if (!rt) {
L
Linus Torvalds 已提交
1259 1260 1261 1262
		err = -ENOMEM;
		goto out;
	}

1263
	rt->dst.obsolete = -1;
1264 1265 1266
	rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
				jiffies + clock_t_to_jiffies(cfg->fc_expires) :
				0;
L
Linus Torvalds 已提交
1267

1268 1269 1270 1271 1272
	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);
L
Linus Torvalds 已提交
1273 1274

	if (addr_type & IPV6_ADDR_MULTICAST)
1275
		rt->dst.input = ip6_mc_input;
1276 1277
	else if (cfg->fc_flags & RTF_LOCAL)
		rt->dst.input = ip6_input;
L
Linus Torvalds 已提交
1278
	else
1279
		rt->dst.input = ip6_forward;
L
Linus Torvalds 已提交
1280

1281
	rt->dst.output = ip6_output;
L
Linus Torvalds 已提交
1282

1283 1284
	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
L
Linus Torvalds 已提交
1285
	if (rt->rt6i_dst.plen == 128)
1286
	       rt->dst.flags |= DST_HOST;
L
Linus Torvalds 已提交
1287

1288 1289 1290 1291 1292 1293 1294 1295
	if (!(rt->dst.flags & DST_HOST) && cfg->fc_mx) {
		u32 *metrics = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
		if (!metrics) {
			err = -ENOMEM;
			goto out;
		}
		dst_init_metrics(&rt->dst, metrics, 0);
	}
L
Linus Torvalds 已提交
1296
#ifdef CONFIG_IPV6_SUBTREES
1297 1298
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
L
Linus Torvalds 已提交
1299 1300
#endif

1301
	rt->rt6i_metric = cfg->fc_metric;
L
Linus Torvalds 已提交
1302 1303 1304 1305

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
1306
	if ((cfg->fc_flags & RTF_REJECT) ||
1307 1308 1309
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
L
Linus Torvalds 已提交
1310
		/* hold loopback dev/idev if we haven't done so. */
1311
		if (dev != net->loopback_dev) {
L
Linus Torvalds 已提交
1312 1313 1314 1315
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
1316
			dev = net->loopback_dev;
L
Linus Torvalds 已提交
1317 1318 1319 1320 1321 1322 1323
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
1324 1325 1326
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		rt->dst.error = -ENETUNREACH;
L
Linus Torvalds 已提交
1327 1328 1329 1330
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

1331
	if (cfg->fc_flags & RTF_GATEWAY) {
1332
		const struct in6_addr *gw_addr;
L
Linus Torvalds 已提交
1333 1334
		int gwa_type;

1335
		gw_addr = &cfg->fc_gateway;
A
Alexey Dobriyan 已提交
1336
		rt->rt6i_gateway = *gw_addr;
L
Linus Torvalds 已提交
1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349
		gwa_type = ipv6_addr_type(gw_addr);

		if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
			struct rt6_info *grt;

			/* IPv6 strictly inhibits using not link-local
			   addresses as nexthop address.
			   Otherwise, router will not able to send redirects.
			   It is very good, but in some (rare!) circumstances
			   (SIT, PtP, NBMA NOARP links) it is handy to allow
			   some exceptions. --ANK
			 */
			err = -EINVAL;
1350
			if (!(gwa_type & IPV6_ADDR_UNICAST))
L
Linus Torvalds 已提交
1351 1352
				goto out;

1353
			grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
L
Linus Torvalds 已提交
1354 1355

			err = -EHOSTUNREACH;
1356
			if (!grt)
L
Linus Torvalds 已提交
1357 1358 1359
				goto out;
			if (dev) {
				if (dev != grt->rt6i_dev) {
1360
					dst_release(&grt->dst);
L
Linus Torvalds 已提交
1361 1362 1363 1364 1365 1366 1367 1368
					goto out;
				}
			} else {
				dev = grt->rt6i_dev;
				idev = grt->rt6i_idev;
				dev_hold(dev);
				in6_dev_hold(grt->rt6i_idev);
			}
1369
			if (!(grt->rt6i_flags & RTF_GATEWAY))
L
Linus Torvalds 已提交
1370
				err = 0;
1371
			dst_release(&grt->dst);
L
Linus Torvalds 已提交
1372 1373 1374 1375 1376

			if (err)
				goto out;
		}
		err = -EINVAL;
1377
		if (!dev || (dev->flags & IFF_LOOPBACK))
L
Linus Torvalds 已提交
1378 1379 1380 1381
			goto out;
	}

	err = -ENODEV;
1382
	if (!dev)
L
Linus Torvalds 已提交
1383 1384
		goto out;

1385 1386 1387 1388 1389
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			err = -EINVAL;
			goto out;
		}
A
Alexey Dobriyan 已提交
1390
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
1391 1392 1393 1394
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

1395
	if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1396 1397 1398
		struct neighbour *n = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
		if (IS_ERR(n)) {
			err = PTR_ERR(n);
L
Linus Torvalds 已提交
1399 1400
			goto out;
		}
1401
		dst_set_neighbour(&rt->dst, n);
L
Linus Torvalds 已提交
1402 1403
	}

1404
	rt->rt6i_flags = cfg->fc_flags;
L
Linus Torvalds 已提交
1405 1406

install_route:
1407 1408 1409 1410 1411
	if (cfg->fc_mx) {
		struct nlattr *nla;
		int remaining;

		nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1412
			int type = nla_type(nla);
1413 1414 1415

			if (type) {
				if (type > RTAX_MAX) {
L
Linus Torvalds 已提交
1416 1417 1418
					err = -EINVAL;
					goto out;
				}
1419

1420
				dst_metric_set(&rt->dst, type, nla_get_u32(nla));
L
Linus Torvalds 已提交
1421 1422 1423 1424
			}
		}
	}

1425
	rt->dst.dev = dev;
L
Linus Torvalds 已提交
1426
	rt->rt6i_idev = idev;
T
Thomas Graf 已提交
1427
	rt->rt6i_table = table;
1428

1429
	cfg->fc_nlinfo.nl_net = dev_net(dev);
1430

1431
	return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
L
Linus Torvalds 已提交
1432 1433 1434 1435 1436 1437 1438

out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
1439
		dst_free(&rt->dst);
L
Linus Torvalds 已提交
1440 1441 1442
	return err;
}

1443
static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
L
Linus Torvalds 已提交
1444 1445
{
	int err;
T
Thomas Graf 已提交
1446
	struct fib6_table *table;
1447
	struct net *net = dev_net(rt->rt6i_dev);
L
Linus Torvalds 已提交
1448

1449
	if (rt == net->ipv6.ip6_null_entry)
1450 1451
		return -ENOENT;

T
Thomas Graf 已提交
1452 1453
	table = rt->rt6i_table;
	write_lock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
1454

1455
	err = fib6_del(rt, info);
1456
	dst_release(&rt->dst);
L
Linus Torvalds 已提交
1457

T
Thomas Graf 已提交
1458
	write_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
1459 1460 1461 1462

	return err;
}

1463 1464
int ip6_del_rt(struct rt6_info *rt)
{
1465
	struct nl_info info = {
1466
		.nl_net = dev_net(rt->rt6i_dev),
1467
	};
1468
	return __ip6_del_rt(rt, &info);
1469 1470
}

1471
static int ip6_route_del(struct fib6_config *cfg)
L
Linus Torvalds 已提交
1472
{
T
Thomas Graf 已提交
1473
	struct fib6_table *table;
L
Linus Torvalds 已提交
1474 1475 1476 1477
	struct fib6_node *fn;
	struct rt6_info *rt;
	int err = -ESRCH;

1478
	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1479
	if (!table)
T
Thomas Graf 已提交
1480 1481 1482
		return err;

	read_lock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
1483

T
Thomas Graf 已提交
1484
	fn = fib6_locate(&table->tb6_root,
1485 1486
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len);
1487

L
Linus Torvalds 已提交
1488
	if (fn) {
1489
		for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1490
			if (cfg->fc_ifindex &&
1491
			    (!rt->rt6i_dev ||
1492
			     rt->rt6i_dev->ifindex != cfg->fc_ifindex))
L
Linus Torvalds 已提交
1493
				continue;
1494 1495
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
L
Linus Torvalds 已提交
1496
				continue;
1497
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
L
Linus Torvalds 已提交
1498
				continue;
1499
			dst_hold(&rt->dst);
T
Thomas Graf 已提交
1500
			read_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
1501

1502
			return __ip6_del_rt(rt, &cfg->fc_nlinfo);
L
Linus Torvalds 已提交
1503 1504
		}
	}
T
Thomas Graf 已提交
1505
	read_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
1506 1507 1508 1509 1510 1511 1512

	return err;
}

/*
 *	Handle redirects
 */
1513
struct ip6rd_flowi {
1514
	struct flowi6 fl6;
1515 1516 1517
	struct in6_addr gateway;
};

1518 1519
static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
1520
					     struct flowi6 *fl6,
1521
					     int flags)
L
Linus Torvalds 已提交
1522
{
1523
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1524
	struct rt6_info *rt;
1525
	struct fib6_node *fn;
T
Thomas Graf 已提交
1526

L
Linus Torvalds 已提交
1527
	/*
1528 1529 1530 1531 1532 1533 1534 1535
	 * Get the "current" route for this destination and
	 * check if the redirect has come from approriate router.
	 *
	 * RFC 2461 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
L
Linus Torvalds 已提交
1536 1537
	 */

T
Thomas Graf 已提交
1538
	read_lock_bh(&table->tb6_lock);
1539
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1540
restart:
1541
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553
		/*
		 * Current route is on-link; redirect is always invalid.
		 *
		 * Seems, previous statement is not true. It could
		 * be node, which looks for us as on-link (f.e. proxy ndisc)
		 * But then router serving it might decide, that we should
		 * know truth 8)8) --ANK (980726).
		 */
		if (rt6_check_expired(rt))
			continue;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
1554
		if (fl6->flowi6_oif != rt->rt6i_dev->ifindex)
1555
			continue;
1556
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1557 1558 1559
			continue;
		break;
	}
1560

1561
	if (!rt)
1562
		rt = net->ipv6.ip6_null_entry;
1563
	BACKTRACK(net, &fl6->saddr);
1564
out:
1565
	dst_hold(&rt->dst);
1566

T
Thomas Graf 已提交
1567
	read_unlock_bh(&table->tb6_lock);
1568

1569 1570 1571
	return rt;
};

1572 1573 1574
static struct rt6_info *ip6_route_redirect(const struct in6_addr *dest,
					   const struct in6_addr *src,
					   const struct in6_addr *gateway,
1575 1576
					   struct net_device *dev)
{
1577
	int flags = RT6_LOOKUP_F_HAS_SADDR;
1578
	struct net *net = dev_net(dev);
1579
	struct ip6rd_flowi rdfl = {
1580 1581 1582 1583
		.fl6 = {
			.flowi6_oif = dev->ifindex,
			.daddr = *dest,
			.saddr = *src,
1584 1585
		},
	};
1586

A
Alexey Dobriyan 已提交
1587
	rdfl.gateway = *gateway;
1588

1589 1590
	if (rt6_need_strict(dest))
		flags |= RT6_LOOKUP_F_IFACE;
1591

1592
	return (struct rt6_info *)fib6_rule_lookup(net, &rdfl.fl6,
1593
						   flags, __ip6_route_redirect);
1594 1595
}

1596 1597
/*
 * Process an accepted-looking redirect for @dest.
 *
 * @saddr is the address the redirect came from and must be the gateway of
 * the current route to @dest; otherwise the redirect is ignored with a
 * rate-limited debug message.  @neigh is the neighbour entry of the new
 * first hop, @lladdr its link-layer address, and @on_link says whether the
 * target itself is the new first hop (no gateway).
 *
 * On success a cached host route to @dest via @neigh is inserted, netevent
 * listeners are notified, and a previously cached route is deleted.
 */
void rt6_redirect(const struct in6_addr *dest, const struct in6_addr *src,
		  const struct in6_addr *saddr,
		  struct neighbour *neigh, u8 *lladdr, int on_link)
{
	struct net *net = dev_net(neigh->dev);
	struct rt6_info *old_rt, *new_rt = NULL;
	struct netevent_redirect event;
	u32 update_flags;

	old_rt = ip6_route_redirect(dest, src, saddr, neigh->dev);

	if (old_rt == net->ipv6.ip6_null_entry) {
		if (net_ratelimit())
			printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
			       "for redirect target\n");
		goto release;
	}

	/*
	 *	We have finally decided to accept it.
	 */
	update_flags = NEIGH_UPDATE_F_WEAK_OVERRIDE | NEIGH_UPDATE_F_OVERRIDE;
	if (!on_link)
		update_flags |= NEIGH_UPDATE_F_OVERRIDE_ISROUTER |
				NEIGH_UPDATE_F_ISROUTER;
	neigh_update(neigh, lladdr, NUD_STALE, update_flags);

	/*
	 * A redirect is only sent in response to a data packet, so the
	 * path through the old nexthop evidently works: confirm it. --ANK
	 */
	dst_confirm(&old_rt->dst);

	/* Duplicate redirect: silently ignore. */
	if (neigh == dst_get_neighbour_raw(&old_rt->dst))
		goto release;

	new_rt = ip6_rt_copy(old_rt, dest);
	if (!new_rt)
		goto release;

	new_rt->rt6i_flags = RTF_UP | RTF_DYNAMIC | RTF_CACHE;
	if (!on_link)
		new_rt->rt6i_flags |= RTF_GATEWAY;

	new_rt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
	dst_set_neighbour(&new_rt->dst, neigh_clone(neigh));

	if (ip6_ins_rt(new_rt))
		goto release;

	event.old = &old_rt->dst;
	event.new = &new_rt->dst;
	call_netevent_notifiers(NETEVENT_REDIRECT, &event);

	if (old_rt->rt6i_flags & RTF_CACHE) {
		/* ip6_del_rt() releases our reference on the old route. */
		ip6_del_rt(old_rt);
		return;
	}

release:
	dst_release(&old_rt->dst);
}

/*
 *	Handle ICMP "packet too big" messages
 *	i.e. Path MTU discovery
 */

1667
/*
 * Record @pmtu as the MTU metric of @dst and, when @allfrag is set, add
 * RTAX_FEATURE_ALLFRAG to its feature metric.
 */
static void rt6_store_pmtu(struct dst_entry *dst, u32 pmtu, int allfrag)
{
	dst_metric_set(dst, RTAX_MTU, pmtu);
	if (allfrag) {
		u32 feat = dst_metric(dst, RTAX_FEATURES);

		feat |= RTAX_FEATURE_ALLFRAG;
		dst_metric_set(dst, RTAX_FEATURES, feat);
	}
}

/*
 * Apply a "packet too big" report of @pmtu for the flow @saddr -> @daddr,
 * looked up with outgoing interface @ifindex (0 for any).
 *
 * Expired routes found on the way are deleted and the lookup is retried.
 * Nothing happens unless @pmtu is smaller than the route's current MTU.
 * A cached route is updated in place; for any other route a new host
 * route carrying the reduced MTU is created and inserted.  Both expire
 * after the ip6_rt_mtu_expires sysctl.
 */
static void rt6_do_pmtu_disc(const struct in6_addr *daddr, const struct in6_addr *saddr,
			     struct net *net, u32 pmtu, int ifindex)
{
	struct rt6_info *rt, *host_rt;
	int allfrag = 0;

	for (;;) {
		rt = rt6_lookup(net, daddr, saddr, ifindex, 0);
		if (!rt)
			return;
		if (!rt6_check_expired(rt))
			break;
		ip6_del_rt(rt);
	}

	if (pmtu >= dst_mtu(&rt->dst))
		goto release;

	if (pmtu < IPV6_MIN_MTU) {
		/*
		 * According to RFC2460, when a Too Big message reports a
		 * PMTU below the IPv6 Minimum Link MTU (1280), the PMTU is
		 * set to that minimum and a fragment header must be
		 * included in every packet from then on.
		 */
		pmtu = IPV6_MIN_MTU;
		allfrag = 1;
	}

	/* A new MTU was received, so the path was valid: Too Big messages
	   are only sent in response to data packets, which means this
	   nexthop is apparently reachable. --ANK
	 */
	dst_confirm(&rt->dst);

	/* Host route. If it is static it would be better not to override
	   it but to add a new one, so that the old pmtu comes back
	   automatically when the cache entry expires.
	 */
	if (rt->rt6i_flags & RTF_CACHE) {
		rt6_store_pmtu(&rt->dst, pmtu, allfrag);
		dst_set_expires(&rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
		rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
		goto release;
	}

	/* Network route.
	   Two cases are possible:
	   1. It is a gatewayed route or NONEXTHOP route. Action: clone it.
	   2. It is a connected route. Action: COW.
	 */
	if (dst_get_neighbour_raw(&rt->dst) || (rt->rt6i_flags & RTF_NONEXTHOP))
		host_rt = rt6_alloc_clone(rt, daddr);
	else
		host_rt = rt6_alloc_cow(rt, daddr, saddr);

	if (host_rt) {
		rt6_store_pmtu(&host_rt->dst, pmtu, allfrag);

		/* RFC 1981: an attempt to detect a PMTU increase must not
		 * happen within 5 minutes of a Too Big message; the
		 * recommended timer is 10 minutes.  The route's expiry is
		 * therefore set to ip6_rt_mtu_expires (10 minutes); once
		 * the reduced pmtu has expired, detection of an increase
		 * happens automatically.
		 */
		dst_set_expires(&host_rt->dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
		host_rt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;

		ip6_ins_rt(host_rt);
	}
release:
	dst_release(&rt->dst);
}

1752
/*
 * Entry point for path MTU updates: apply @pmtu for @saddr -> @daddr both
 * to the route chosen without an interface constraint and to the route
 * through @dev, the device the Packet Too Big message arrived on.
 */
void rt6_pmtu_discovery(const struct in6_addr *daddr, const struct in6_addr *saddr,
			struct net_device *dev, u32 pmtu)
{
	struct net *netns = dev_net(dev);

	/*
	 * RFC 1981 says a node "MUST reduce the size of the packets it is
	 * sending along the path" that triggered the Packet Too Big
	 * message.  In general we cannot tell which interface the original
	 * packet left through, so the MTU is lowered on the interface that
	 * future packets will use.
	 */
	rt6_do_pmtu_disc(daddr, saddr, netns, pmtu, 0);

	/*
	 * The original packet may also have been forced out of the
	 * receiving interface (SO_BINDTODEVICE or similar), so that route
	 * is updated too.  This is the next best thing to the correct
	 * behaviour of updating the MTU on all interfaces.
	 */
	rt6_do_pmtu_disc(daddr, saddr, netns, pmtu, dev->ifindex);
}

L
Linus Torvalds 已提交
1773 1774 1775 1776
/*
 *	Misc support functions
 */

E
Eric Dumazet 已提交
1777 1778
static struct rt6_info *ip6_rt_copy(const struct rt6_info *ort,
				    const struct in6_addr *dest)
L
Linus Torvalds 已提交
1779
{
1780
	struct net *net = dev_net(ort->rt6i_dev);
1781
	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
1782
					    ort->dst.dev, 0);
L
Linus Torvalds 已提交
1783 1784

	if (rt) {
1785 1786
		rt->dst.input = ort->dst.input;
		rt->dst.output = ort->dst.output;
1787
		rt->dst.flags |= DST_HOST;
1788

A
Alexey Dobriyan 已提交
1789
		rt->rt6i_dst.addr = *dest;
1790
		rt->rt6i_dst.plen = 128;
1791
		dst_copy_metrics(&rt->dst, &ort->dst);
1792
		rt->dst.error = ort->dst.error;
L
Linus Torvalds 已提交
1793 1794 1795
		rt->rt6i_idev = ort->rt6i_idev;
		if (rt->rt6i_idev)
			in6_dev_hold(rt->rt6i_idev);
1796
		rt->dst.lastuse = jiffies;
L
Linus Torvalds 已提交
1797 1798
		rt->rt6i_expires = 0;

A
Alexey Dobriyan 已提交
1799
		rt->rt6i_gateway = ort->rt6i_gateway;
L
Linus Torvalds 已提交
1800 1801 1802 1803 1804 1805
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
		rt->rt6i_metric = 0;

#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
1806
		memcpy(&rt->rt6i_prefsrc, &ort->rt6i_prefsrc, sizeof(struct rt6key));
T
Thomas Graf 已提交
1807
		rt->rt6i_table = ort->rt6i_table;
L
Linus Torvalds 已提交
1808 1809 1810 1811
	}
	return rt;
}

1812
#ifdef CONFIG_IPV6_ROUTE_INFO
1813
static struct rt6_info *rt6_get_route_info(struct net *net,
1814 1815
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex)
1816 1817 1818
{
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
T
Thomas Graf 已提交
1819 1820
	struct fib6_table *table;

1821
	table = fib6_get_table(net, RT6_TABLE_INFO);
1822
	if (!table)
T
Thomas Graf 已提交
1823
		return NULL;
1824

T
Thomas Graf 已提交
1825 1826
	write_lock_bh(&table->tb6_lock);
	fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1827 1828 1829
	if (!fn)
		goto out;

1830
	for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1831 1832 1833 1834 1835 1836
		if (rt->rt6i_dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
			continue;
1837
		dst_hold(&rt->dst);
1838 1839 1840
		break;
	}
out:
T
Thomas Graf 已提交
1841
	write_unlock_bh(&table->tb6_lock);
1842 1843 1844
	return rt;
}

1845
static struct rt6_info *rt6_add_route_info(struct net *net,
1846 1847
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr, int ifindex,
1848 1849
					   unsigned pref)
{
1850 1851
	struct fib6_config cfg = {
		.fc_table	= RT6_TABLE_INFO,
1852
		.fc_metric	= IP6_RT_PRIO_USER,
1853 1854 1855 1856
		.fc_ifindex	= ifindex,
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
1857 1858 1859
		.fc_nlinfo.pid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
1860 1861
	};

A
Alexey Dobriyan 已提交
1862 1863
	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;
1864

1865 1866
	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
1867
		cfg.fc_flags |= RTF_DEFAULT;
1868

1869
	ip6_route_add(&cfg);
1870

1871
	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1872 1873 1874
}
#endif

1875
struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1876
{
L
Linus Torvalds 已提交
1877
	struct rt6_info *rt;
T
Thomas Graf 已提交
1878
	struct fib6_table *table;
L
Linus Torvalds 已提交
1879

1880
	table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1881
	if (!table)
T
Thomas Graf 已提交
1882
		return NULL;
L
Linus Torvalds 已提交
1883

T
Thomas Graf 已提交
1884
	write_lock_bh(&table->tb6_lock);
1885
	for (rt = table->tb6_root.leaf; rt; rt=rt->dst.rt6_next) {
L
Linus Torvalds 已提交
1886
		if (dev == rt->rt6i_dev &&
1887
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
L
Linus Torvalds 已提交
1888 1889 1890 1891
		    ipv6_addr_equal(&rt->rt6i_gateway, addr))
			break;
	}
	if (rt)
1892
		dst_hold(&rt->dst);
T
Thomas Graf 已提交
1893
	write_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
1894 1895 1896
	return rt;
}

1897
struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
1898 1899
				     struct net_device *dev,
				     unsigned int pref)
L
Linus Torvalds 已提交
1900
{
1901 1902
	struct fib6_config cfg = {
		.fc_table	= RT6_TABLE_DFLT,
1903
		.fc_metric	= IP6_RT_PRIO_USER,
1904 1905 1906
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1907 1908
		.fc_nlinfo.pid = 0,
		.fc_nlinfo.nlh = NULL,
1909
		.fc_nlinfo.nl_net = dev_net(dev),
1910
	};
L
Linus Torvalds 已提交
1911

A
Alexey Dobriyan 已提交
1912
	cfg.fc_gateway = *gwaddr;
L
Linus Torvalds 已提交
1913

1914
	ip6_route_add(&cfg);
L
Linus Torvalds 已提交
1915 1916 1917 1918

	return rt6_get_dflt_router(gwaddr, dev);
}

1919
void rt6_purge_dflt_routers(struct net *net)
L
Linus Torvalds 已提交
1920 1921
{
	struct rt6_info *rt;
T
Thomas Graf 已提交
1922 1923 1924
	struct fib6_table *table;

	/* NOTE: Keep consistent with rt6_get_dflt_router */
1925
	table = fib6_get_table(net, RT6_TABLE_DFLT);
1926
	if (!table)
T
Thomas Graf 已提交
1927
		return;
L
Linus Torvalds 已提交
1928 1929

restart:
T
Thomas Graf 已提交
1930
	read_lock_bh(&table->tb6_lock);
1931
	for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
L
Linus Torvalds 已提交
1932
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1933
			dst_hold(&rt->dst);
T
Thomas Graf 已提交
1934
			read_unlock_bh(&table->tb6_lock);
1935
			ip6_del_rt(rt);
L
Linus Torvalds 已提交
1936 1937 1938
			goto restart;
		}
	}
T
Thomas Graf 已提交
1939
	read_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
1940 1941
}

1942 1943
static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955
				 struct fib6_config *cfg)
{
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = RT6_TABLE_MAIN;
	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
	cfg->fc_metric = rtmsg->rtmsg_metric;
	cfg->fc_expires = rtmsg->rtmsg_info;
	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
	cfg->fc_src_len = rtmsg->rtmsg_src_len;
	cfg->fc_flags = rtmsg->rtmsg_flags;

1956
	cfg->fc_nlinfo.nl_net = net;
1957

A
Alexey Dobriyan 已提交
1958 1959 1960
	cfg->fc_dst = rtmsg->rtmsg_dst;
	cfg->fc_src = rtmsg->rtmsg_src;
	cfg->fc_gateway = rtmsg->rtmsg_gateway;
1961 1962
}

1963
/*
 * Handle the SIOCADDRT / SIOCDELRT route ioctls.
 *
 * @arg points to a user-space struct in6_rtmsg.  The route is added or
 * deleted under the RTNL lock.
 *
 * Returns -EINVAL for any other @cmd, -EPERM without CAP_NET_ADMIN,
 * -EFAULT if @arg cannot be copied, otherwise the result of
 * ip6_route_add() or ip6_route_del().
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct in6_rtmsg rtmsg;
	struct fib6_config cfg;
	int err;

	if (cmd != SIOCADDRT && cmd != SIOCDELRT)
		return -EINVAL;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	if (copy_from_user(&rtmsg, arg, sizeof(struct in6_rtmsg)))
		return -EFAULT;

	rtmsg_to_fib6_config(net, &rtmsg, &cfg);

	rtnl_lock();
	if (cmd == SIOCADDRT)
		err = ip6_route_add(&cfg);	/* Add a route */
	else
		err = ip6_route_del(&cfg);	/* Delete a route */
	rtnl_unlock();

	return err;
}

/*
 *	Drop the packet on the floor
 */

2004
/*
 * Drop @skb: bump a statistics counter, send an ICMPv6 destination
 * unreachable error with @code, and free the packet.
 *
 * @ipstats_mib_noroutes selects the counter.  For IPSTATS_MIB_INNOROUTES
 * a packet addressed to the unspecified address is counted as
 * IPSTATS_MIB_INADDRERRORS instead.  Values other than the IN/OUT
 * "no routes" counters are not counted at all.
 *
 * Always returns 0.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	struct dst_entry *dst = skb_dst(skb);
	int inbound = (ipstats_mib_noroutes == IPSTATS_MIB_INNOROUTES);
	int outbound = (ipstats_mib_noroutes == IPSTATS_MIB_OUTNOROUTES);

	if (inbound &&
	    ipv6_addr_type(&ipv6_hdr(skb)->daddr) == IPV6_ADDR_ANY) {
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      IPSTATS_MIB_INADDRERRORS);
	} else if (inbound || outbound) {
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
	}

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}

2027 2028
static int ip6_pkt_discard(struct sk_buff *skb)
{
2029
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2030 2031
}

2032
static int ip6_pkt_discard_out(struct sk_buff *skb)
L
Linus Torvalds 已提交
2033
{
E
Eric Dumazet 已提交
2034
	skb->dev = skb_dst(skb)->dev;
2035
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
L
Linus Torvalds 已提交
2036 2037
}

2038 2039
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

2040 2041
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
2042
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2043 2044 2045 2046
}

static int ip6_pkt_prohibit_out(struct sk_buff *skb)
{
E
Eric Dumazet 已提交
2047
	skb->dev = skb_dst(skb)->dev;
2048
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2049 2050
}

2051 2052
#endif

L
Linus Torvalds 已提交
2053 2054 2055 2056 2057 2058 2059 2060
/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    int anycast)
{
2061
	struct net *net = dev_net(idev->dev);
2062
	struct rt6_info *rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops,
2063
					    net->loopback_dev, 0);
2064
	struct neighbour *neigh;
L
Linus Torvalds 已提交
2065

2066
	if (!rt) {
2067 2068 2069
		if (net_ratelimit())
			pr_warning("IPv6:  Maximum number of routes reached,"
				   " consider increasing route/max_size.\n");
L
Linus Torvalds 已提交
2070
		return ERR_PTR(-ENOMEM);
2071
	}
L
Linus Torvalds 已提交
2072 2073 2074

	in6_dev_hold(idev);

2075
	rt->dst.flags |= DST_HOST;
2076 2077
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
L
Linus Torvalds 已提交
2078
	rt->rt6i_idev = idev;
2079
	rt->dst.obsolete = -1;
L
Linus Torvalds 已提交
2080 2081

	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2082 2083 2084
	if (anycast)
		rt->rt6i_flags |= RTF_ANYCAST;
	else
L
Linus Torvalds 已提交
2085
		rt->rt6i_flags |= RTF_LOCAL;
2086
	neigh = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, rt->rt6i_dev);
2087
	if (IS_ERR(neigh)) {
2088
		dst_free(&rt->dst);
2089

2090
		return ERR_CAST(neigh);
L
Linus Torvalds 已提交
2091
	}
2092
	dst_set_neighbour(&rt->dst, neigh);
L
Linus Torvalds 已提交
2093

A
Alexey Dobriyan 已提交
2094
	rt->rt6i_dst.addr = *addr;
L
Linus Torvalds 已提交
2095
	rt->rt6i_dst.plen = 128;
2096
	rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
L
Linus Torvalds 已提交
2097

2098
	atomic_set(&rt->dst.__refcnt, 1);
L
Linus Torvalds 已提交
2099 2100 2101 2102

	return rt;
}

2103 2104
int ip6_route_get_saddr(struct net *net,
			struct rt6_info *rt,
2105
			const struct in6_addr *daddr,
2106 2107 2108 2109 2110 2111
			unsigned int prefs,
			struct in6_addr *saddr)
{
	struct inet6_dev *idev = ip6_dst_idev((struct dst_entry*)rt);
	int err = 0;
	if (rt->rt6i_prefsrc.plen)
A
Alexey Dobriyan 已提交
2112
		*saddr = rt->rt6i_prefsrc.addr;
2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131
	else
		err = ipv6_dev_get_saddr(net, idev ? idev->dev : NULL,
					 daddr, prefs, saddr);
	return err;
}

/* remove deleted ip from prefsrc entries */
/* Argument bundle handed to fib6_remove_prefsrc() through fib6_clean_all(). */
struct arg_dev_net_ip {
	struct net_device *dev;	/* device to match; NULL matches any device */
	struct net *net;	/* namespace whose null entry is left alone */
	struct in6_addr *addr;	/* address compared against each prefsrc */
};

static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

2132
	if (((void *)rt->rt6i_dev == dev || !dev) &&
2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151
	    rt != net->ipv6.ip6_null_entry &&
	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
		/* remove prefsrc entry */
		rt->rt6i_prefsrc.plen = 0;
	}
	return 0;
}

/*
 * Walk all routes in the namespace of @ifp's device and clear any
 * preferred source that equals @ifp's address on that device.
 */
void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net_device *dev = ifp->idev->dev;
	struct net *net = dev_net(dev);
	struct arg_dev_net_ip adni = {
		.dev = dev,
		.net = net,
		.addr = &ifp->addr,
	};

	fib6_clean_all(net, fib6_remove_prefsrc, 0, &adni);
}

2152 2153 2154 2155 2156
/* Argument bundle handed to fib6_ifdown() by rt6_ifdown(). */
struct arg_dev_net {
	struct net_device *dev;	/* device going down; NULL matches any device */
	struct net *net;	/* namespace whose null entry must be kept */
};

L
Linus Torvalds 已提交
2157 2158
static int fib6_ifdown(struct rt6_info *rt, void *arg)
{
S
stephen hemminger 已提交
2159 2160
	const struct arg_dev_net *adn = arg;
	const struct net_device *dev = adn->dev;
2161

2162
	if ((rt->rt6i_dev == dev || !dev) &&
S
stephen hemminger 已提交
2163
	    rt != adn->net->ipv6.ip6_null_entry) {
L
Linus Torvalds 已提交
2164 2165 2166 2167 2168 2169
		RT6_TRACE("deleted by ifdown %p\n", rt);
		return -1;
	}
	return 0;
}

2170
void rt6_ifdown(struct net *net, struct net_device *dev)
L
Linus Torvalds 已提交
2171
{
2172 2173 2174 2175 2176 2177
	struct arg_dev_net adn = {
		.dev = dev,
		.net = net,
	};

	fib6_clean_all(net, fib6_ifdown, 0, &adn);
2178
	icmp6_clean_all(fib6_ifdown, &adn);
L
Linus Torvalds 已提交
2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198
}

/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg
{
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU of that device */
};

static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
2199
	if (!idev)
L
Linus Torvalds 已提交
2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	/*
	   If new MTU is less than route PMTU, this new MTU will be the
	   lowest MTU in the path, update the route PMTU to reflect PMTU
	   decreases; if new MTU is greater than route PMTU, and the
	   old MTU is the lowest MTU in the path, update the route PMTU
	   to reflect the increase. In this case if the other nodes' MTU
	   also have the lowest MTU, TOO BIG MESSAGE will be lead to
	   PMTU discouvery.
	 */
	if (rt->rt6i_dev == arg->dev &&
2217 2218 2219 2220
	    !dst_metric_locked(&rt->dst, RTAX_MTU) &&
	    (dst_mtu(&rt->dst) >= arg->mtu ||
	     (dst_mtu(&rt->dst) < arg->mtu &&
	      dst_mtu(&rt->dst) == idev->cnf.mtu6))) {
2221
		dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2222
	}
L
Linus Torvalds 已提交
2223 2224 2225 2226 2227
	return 0;
}

void rt6_mtu_change(struct net_device *dev, unsigned mtu)
{
T
Thomas Graf 已提交
2228 2229 2230 2231
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};
L
Linus Torvalds 已提交
2232

2233
	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
L
Linus Torvalds 已提交
2234 2235
}

2236
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2237
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2238
	[RTA_OIF]               = { .type = NLA_U32 },
2239
	[RTA_IIF]		= { .type = NLA_U32 },
2240 2241 2242 2243 2244 2245
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
};

static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg)
L
Linus Torvalds 已提交
2246
{
2247 2248 2249
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	int err;
L
Linus Torvalds 已提交
2250

2251 2252 2253
	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;
L
Linus Torvalds 已提交
2254

2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267
	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;

	if (rtm->rtm_type == RTN_UNREACHABLE)
		cfg->fc_flags |= RTF_REJECT;

2268 2269 2270
	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

2271 2272
	cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
	cfg->fc_nlinfo.nlh = nlh;
2273
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2274 2275 2276 2277

	if (tb[RTA_GATEWAY]) {
		nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
		cfg->fc_flags |= RTF_GATEWAY;
L
Linus Torvalds 已提交
2278
	}
2279 2280 2281 2282 2283 2284 2285 2286

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
L
Linus Torvalds 已提交
2287
	}
2288 2289 2290 2291 2292 2293 2294 2295

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
L
Linus Torvalds 已提交
2296
	}
2297

2298 2299 2300
	if (tb[RTA_PREFSRC])
		nla_memcpy(&cfg->fc_prefsrc, tb[RTA_PREFSRC], 16);

2301 2302 2303 2304 2305 2306 2307 2308 2309
	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
L
Linus Torvalds 已提交
2310
	}
2311 2312 2313 2314 2315 2316 2317

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	err = 0;
errout:
	return err;
L
Linus Torvalds 已提交
2318 2319
}

2320
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
L
Linus Torvalds 已提交
2321
{
2322 2323
	struct fib6_config cfg;
	int err;
L
Linus Torvalds 已提交
2324

2325 2326 2327 2328 2329
	err = rtm_to_fib6_config(skb, nlh, &cfg);
	if (err < 0)
		return err;

	return ip6_route_del(&cfg);
L
Linus Torvalds 已提交
2330 2331
}

2332
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
L
Linus Torvalds 已提交
2333
{
2334 2335
	struct fib6_config cfg;
	int err;
L
Linus Torvalds 已提交
2336

2337 2338 2339 2340 2341
	err = rtm_to_fib6_config(skb, nlh, &cfg);
	if (err < 0)
		return err;

	return ip6_route_add(&cfg);
L
Linus Torvalds 已提交
2342 2343
}

2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354
static inline size_t rt6_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
2355
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2356 2357 2358
	       + nla_total_size(sizeof(struct rta_cacheinfo));
}

2359 2360
/*
 * Append one route message describing @rt to @skb.
 *
 * @net:    namespace, handed to ip6mr_get_route()/ip6_route_get_saddr()
 * @skb:    netlink buffer being filled
 * @rt:     route to describe
 * @dst:    if non-NULL, reported as RTA_DST with a length of 128 instead
 *          of the route's own destination prefix
 * @src:    same for RTA_SRC (only with CONFIG_IPV6_SUBTREES)
 * @iif:    input interface index to report, 0 for none
 * @type:   netlink message type
 * @pid, @seq, @flags: netlink header fields
 * @prefix: non-zero to emit only routes flagged RTF_PREFIX_RT
 * @nowait: forwarded to ip6mr_get_route()
 *
 * Returns the value of nlmsg_end() on success, 1 if the route was skipped
 * by the @prefix filter, 0 if ip6mr_get_route() returned 0 with @nowait
 * clear, and -EMSGSIZE if the message could not be built (the partial
 * message is cancelled).
 *
 * The NLA_PUT*() macros jump to nla_put_failure when the attribute does
 * not fit.
 */
static int rt6_fill_node(struct net *net,
			 struct sk_buff *skb, struct rt6_info *rt,
			 struct in6_addr *dst, struct in6_addr *src,
			 int iif, int type, u32 pid, u32 seq,
			 int prefix, int nowait, unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires;
	u32 table;
	struct neighbour *n;

	if (prefix) {	/* user wants prefix routes only */
		if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
	if (rt->rt6i_table)
		table = rt->rt6i_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	NLA_PUT_U32(skb, RTA_TABLE, table);
	if (rt->rt6i_flags & RTF_REJECT)
		rtm->rtm_type = RTN_UNREACHABLE;
	else if (rt->rt6i_flags & RTF_LOCAL)
		rtm->rtm_type = RTN_LOCAL;
	else if (rt->rt6i_dev && (rt->rt6i_dev->flags & IFF_LOOPBACK))
		rtm->rtm_type = RTN_LOCAL;
	else
		rtm->rtm_type = RTN_UNICAST;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;
	if (rt->rt6i_flags & RTF_DYNAMIC)
		rtm->rtm_protocol = RTPROT_REDIRECT;
	else if (rt->rt6i_flags & RTF_ADDRCONF)
		rtm->rtm_protocol = RTPROT_KERNEL;
	else if (rt->rt6i_flags & RTF_DEFAULT)
		rtm->rtm_protocol = RTPROT_RA;

	if (rt->rt6i_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dst) {
		NLA_PUT(skb, RTA_DST, 16, dst);
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		NLA_PUT(skb, RTA_SRC, 16, src);
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len)
		NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, iif);
	} else if (dst) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0)
			NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
	}

	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->rt6i_prefsrc.addr;
		NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
	}

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	rcu_read_lock();
	n = dst_get_neighbour(&rt->dst);
	if (n && nla_put(skb, RTA_GATEWAY, 16, &n->primary_key) < 0) {
		/*
		 * Do not use NLA_PUT() here: its jump to nla_put_failure
		 * would leave the RCU read-side critical section open.
		 */
		rcu_read_unlock();
		goto nla_put_failure;
	}
	rcu_read_unlock();

	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);

	NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);

	/* remaining lifetime in jiffies, capped at INT_MAX; 0 = no expiry */
	if (!(rt->rt6i_flags & RTF_EXPIRES))
		expires = 0;
	else if (rt->rt6i_expires - jiffies < INT_MAX)
		expires = rt->rt6i_expires - jiffies;
	else
		expires = INT_MAX;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, 0, 0,
			       expires, rt->dst.error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

2487
int rt6_dump_route(struct rt6_info *rt, void *p_arg)
L
Linus Torvalds 已提交
2488 2489 2490 2491
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	int prefix;

2492 2493
	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
L
Linus Torvalds 已提交
2494 2495 2496 2497
		prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
	} else
		prefix = 0;

2498 2499
	return rt6_fill_node(arg->net,
		     arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
L
Linus Torvalds 已提交
2500
		     NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2501
		     prefix, 0, NLM_F_MULTI);
L
Linus Torvalds 已提交
2502 2503
}

2504
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
L
Linus Torvalds 已提交
2505
{
2506
	struct net *net = sock_net(in_skb->sk);
2507 2508
	struct nlattr *tb[RTA_MAX+1];
	struct rt6_info *rt;
L
Linus Torvalds 已提交
2509
	struct sk_buff *skb;
2510
	struct rtmsg *rtm;
2511
	struct flowi6 fl6;
2512
	int err, iif = 0;
L
Linus Torvalds 已提交
2513

2514 2515 2516
	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
	if (err < 0)
		goto errout;
L
Linus Torvalds 已提交
2517

2518
	err = -EINVAL;
2519
	memset(&fl6, 0, sizeof(fl6));
L
Linus Torvalds 已提交
2520

2521 2522 2523 2524
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

A
Alexey Dobriyan 已提交
2525
		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
2526 2527 2528 2529 2530 2531
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

A
Alexey Dobriyan 已提交
2532
		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
2533 2534 2535 2536 2537 2538
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
2539
		fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
L
Linus Torvalds 已提交
2540 2541 2542

	if (iif) {
		struct net_device *dev;
2543
		dev = __dev_get_by_index(net, iif);
L
Linus Torvalds 已提交
2544 2545
		if (!dev) {
			err = -ENODEV;
2546
			goto errout;
L
Linus Torvalds 已提交
2547 2548 2549
		}
	}

2550
	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2551
	if (!skb) {
2552 2553 2554
		err = -ENOBUFS;
		goto errout;
	}
L
Linus Torvalds 已提交
2555

2556 2557 2558
	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
2559
	skb_reset_mac_header(skb);
2560
	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
L
Linus Torvalds 已提交
2561

2562
	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
2563
	skb_dst_set(skb, &rt->dst);
L
Linus Torvalds 已提交
2564

2565
	err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
L
Linus Torvalds 已提交
2566
			    RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2567
			    nlh->nlmsg_seq, 0, 0, 0);
L
Linus Torvalds 已提交
2568
	if (err < 0) {
2569 2570
		kfree_skb(skb);
		goto errout;
L
Linus Torvalds 已提交
2571 2572
	}

2573
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2574
errout:
L
Linus Torvalds 已提交
2575 2576 2577
	return err;
}

2578
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
L
Linus Torvalds 已提交
2579 2580
{
	struct sk_buff *skb;
2581
	struct net *net = info->nl_net;
2582 2583 2584 2585
	u32 seq;
	int err;

	err = -ENOBUFS;
2586
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2587

2588
	skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2589
	if (!skb)
2590 2591
		goto errout;

2592
	err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
2593
				event, info->pid, seq, 0, 0, 0);
2594 2595 2596 2597 2598 2599
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
2600 2601 2602
	rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
2603 2604
errout:
	if (err < 0)
2605
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
L
Linus Torvalds 已提交
2606 2607
}

2608 2609 2610 2611
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *data)
{
	struct net_device *dev = (struct net_device *)data;
2612
	struct net *net = dev_net(dev);
2613 2614

	if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2615
		net->ipv6.ip6_null_entry->dst.dev = dev;
2616 2617
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
2618
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
2619
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2620
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
2621 2622 2623 2624 2625 2626 2627
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	}

	return NOTIFY_OK;
}

L
Linus Torvalds 已提交
2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644
/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS

/*
 * NOTE(review): nothing in the visible part of this file uses this
 * structure (the /proc output below goes through seq_file); the field
 * meanings are not established here - confirm before relying on it.
 */
struct rt6_proc_arg
{
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};

static int rt6_info_route(struct rt6_info *rt, void *p_arg)
{
2645
	struct seq_file *m = p_arg;
2646
	struct neighbour *n;
L
Linus Torvalds 已提交
2647

2648
	seq_printf(m, "%pi6 %02x ", &rt->rt6i_dst.addr, rt->rt6i_dst.plen);
L
Linus Torvalds 已提交
2649 2650

#ifdef CONFIG_IPV6_SUBTREES
2651
	seq_printf(m, "%pi6 %02x ", &rt->rt6i_src.addr, rt->rt6i_src.plen);
L
Linus Torvalds 已提交
2652
#else
2653
	seq_puts(m, "00000000000000000000000000000000 00 ");
L
Linus Torvalds 已提交
2654
#endif
2655
	rcu_read_lock();
2656 2657 2658
	n = dst_get_neighbour(&rt->dst);
	if (n) {
		seq_printf(m, "%pi6", n->primary_key);
L
Linus Torvalds 已提交
2659
	} else {
2660
		seq_puts(m, "00000000000000000000000000000000");
L
Linus Torvalds 已提交
2661
	}
2662
	rcu_read_unlock();
2663
	seq_printf(m, " %08x %08x %08x %08x %8s\n",
2664 2665
		   rt->rt6i_metric, atomic_read(&rt->dst.__refcnt),
		   rt->dst.__use, rt->rt6i_flags,
2666
		   rt->rt6i_dev ? rt->rt6i_dev->name : "");
L
Linus Torvalds 已提交
2667 2668 2669
	return 0;
}

2670
static int ipv6_route_show(struct seq_file *m, void *v)
L
Linus Torvalds 已提交
2671
{
2672 2673
	struct net *net = (struct net *)m->private;
	fib6_clean_all(net, rt6_info_route, 0, m);
2674 2675
	return 0;
}
L
Linus Torvalds 已提交
2676

2677 2678
static int ipv6_route_open(struct inode *inode, struct file *file)
{
2679
	return single_open_net(inode, file, ipv6_route_show);
2680 2681
}

2682 2683 2684 2685 2686
static const struct file_operations ipv6_route_proc_fops = {
	.owner		= THIS_MODULE,
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
2687
	.release	= single_release_net,
2688 2689
};

L
Linus Torvalds 已提交
2690 2691
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
2692
	struct net *net = (struct net *)seq->private;
L
Linus Torvalds 已提交
2693
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2694 2695 2696 2697 2698
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   net->ipv6.rt6_stats->fib_rt_alloc,
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
2699
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
2700
		   net->ipv6.rt6_stats->fib_discarded_routes);
L
Linus Torvalds 已提交
2701 2702 2703 2704 2705 2706

	return 0;
}

static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
2707
	return single_open_net(inode, file, rt6_stats_seq_show);
2708 2709
}

2710
static const struct file_operations rt6_stats_seq_fops = {
L
Linus Torvalds 已提交
2711 2712 2713 2714
	.owner	 = THIS_MODULE,
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
2715
	.release = single_release_net,
L
Linus Torvalds 已提交
2716 2717 2718 2719 2720 2721
};
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

static
2722
int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write,
L
Linus Torvalds 已提交
2723 2724
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
2725 2726 2727
	struct net *net;
	int delay;
	if (!write)
L
Linus Torvalds 已提交
2728
		return -EINVAL;
2729 2730 2731 2732 2733 2734

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
	fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
	return 0;
L
Linus Torvalds 已提交
2735 2736
}

2737
ctl_table ipv6_route_table_template[] = {
2738
	{
L
Linus Torvalds 已提交
2739
		.procname	=	"flush",
2740
		.data		=	&init_net.ipv6.sysctl.flush_delay,
L
Linus Torvalds 已提交
2741
		.maxlen		=	sizeof(int),
2742
		.mode		=	0200,
A
Alexey Dobriyan 已提交
2743
		.proc_handler	=	ipv6_sysctl_rtcache_flush
L
Linus Torvalds 已提交
2744 2745 2746
	},
	{
		.procname	=	"gc_thresh",
2747
		.data		=	&ip6_dst_ops_template.gc_thresh,
L
Linus Torvalds 已提交
2748 2749
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
2750
		.proc_handler	=	proc_dointvec,
L
Linus Torvalds 已提交
2751 2752 2753
	},
	{
		.procname	=	"max_size",
2754
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
L
Linus Torvalds 已提交
2755 2756
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
2757
		.proc_handler	=	proc_dointvec,
L
Linus Torvalds 已提交
2758 2759 2760
	},
	{
		.procname	=	"gc_min_interval",
2761
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
L
Linus Torvalds 已提交
2762 2763
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
2764
		.proc_handler	=	proc_dointvec_jiffies,
L
Linus Torvalds 已提交
2765 2766 2767
	},
	{
		.procname	=	"gc_timeout",
2768
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
L
Linus Torvalds 已提交
2769 2770
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
2771
		.proc_handler	=	proc_dointvec_jiffies,
L
Linus Torvalds 已提交
2772 2773 2774
	},
	{
		.procname	=	"gc_interval",
2775
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
L
Linus Torvalds 已提交
2776 2777
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
2778
		.proc_handler	=	proc_dointvec_jiffies,
L
Linus Torvalds 已提交
2779 2780 2781
	},
	{
		.procname	=	"gc_elasticity",
2782
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
L
Linus Torvalds 已提交
2783 2784
		.maxlen		=	sizeof(int),
		.mode		=	0644,
2785
		.proc_handler	=	proc_dointvec,
L
Linus Torvalds 已提交
2786 2787 2788
	},
	{
		.procname	=	"mtu_expires",
2789
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
L
Linus Torvalds 已提交
2790 2791
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
2792
		.proc_handler	=	proc_dointvec_jiffies,
L
Linus Torvalds 已提交
2793 2794 2795
	},
	{
		.procname	=	"min_adv_mss",
2796
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
L
Linus Torvalds 已提交
2797 2798
		.maxlen		=	sizeof(int),
		.mode		=	0644,
2799
		.proc_handler	=	proc_dointvec,
L
Linus Torvalds 已提交
2800 2801 2802
	},
	{
		.procname	=	"gc_min_interval_ms",
2803
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
L
Linus Torvalds 已提交
2804 2805
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
2806
		.proc_handler	=	proc_dointvec_ms_jiffies,
L
Linus Torvalds 已提交
2807
	},
2808
	{ }
L
Linus Torvalds 已提交
2809 2810
};

2811
/*
 * Build the route sysctl table of namespace @net: duplicate
 * ipv6_route_table_template and point every entry at @net's own
 * variables.  Entry 0 ("flush") also gets @net in extra1 for its handler.
 *
 * Returns the kmemdup()'ed table, or NULL if the allocation failed.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);
	if (!table)
		return table;

	/* indices follow the entry order of ipv6_route_table_template */
	table[0].data = &net->ipv6.sysctl.flush_delay;
	table[0].extra1 = net;
	table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
	table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
	table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
	table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
	table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
	table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
	table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
	table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
	table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

	return table;
}
L
Linus Torvalds 已提交
2835 2836
#endif

2837
/*
 * Per-namespace initialisation of the IPv6 routing code.
 *
 * Copies the dst_ops template, duplicates the special route entries (null
 * entry, and with CONFIG_IPV6_MULTIPLE_TABLES the prohibit and blackhole
 * entries) from their templates, sets the sysctl defaults and, with
 * CONFIG_PROC_FS, creates the "ipv6_route" and "rt6_stats" proc entries.
 *
 * Returns 0 on success and -ENOMEM on failure, after releasing whatever
 * had been set up.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	struct rt6_info *rt;
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_ip6_dst_entries;
	rt = net->ipv6.ip6_null_entry;
	/* each special entry is its own dst path */
	rt->dst.path = (struct dst_entry *)rt;
	rt->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&rt->dst, ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	rt = net->ipv6.ip6_prohibit_entry;
	rt->dst.path = (struct dst_entry *)rt;
	rt->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&rt->dst, ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	rt = net->ipv6.ip6_blk_hole_entry;
	rt->dst.path = (struct dst_entry *)rt;
	rt->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&rt->dst, ip6_template_metrics, true);
#endif

	/* sysctl defaults */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

#ifdef CONFIG_PROC_FS
	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
#endif
	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	return 0;

	/* error unwinding; ret is still -ENOMEM on every path below */
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	return ret;
}

2913
/*
 * Per-namespace teardown, the counterpart of ip6_route_net_init(): remove
 * the proc entries, free the special route entries and destroy the dst
 * entry counter of the namespace's dst_ops.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_net_remove(net, "ipv6_route");
	proc_net_remove(net, "rt6_stats");
#endif

	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif

	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

/*
 * Per-namespace init/exit hooks; registered with register_pernet_subsys()
 * from ip6_route_init().
 */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

2932 2933 2934 2935 2936
/*
 * Netdevice notifier block calling ip6_route_dev_notify(); registered from
 * ip6_route_init() and unregistered in ip6_route_cleanup().
 */
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = 0,
};

2937
/*
 * Module-level initialisation of the IPv6 routing code.
 *
 * In order: create the rt6_info slab cache, initialise the blackhole
 * dst_ops counter, register the per-namespace operations, attach the
 * loopback device to init_net's special route entries, initialise the
 * FIB, xfrm6 and the routing rules, register the rtnetlink route handlers
 * and finally the netdevice notifier.
 *
 * Returns 0 on success.  On failure the steps already completed are
 * undone in reverse order through the labels at the end, and the error is
 * returned.
 */
int __init ip6_route_init(void)
{
	int ret;

	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		return -ENOMEM;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_dst_entries;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto out_xfrm6_init;

	ret = -ENOBUFS;
	if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
	    __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
		goto out_fib6_rules_init;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_fib6_rules_init;

	return 0;

out_fib6_rules_init:
	fib6_rules_cleanup();
out_xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	return ret;
}

void ip6_route_cleanup(void)
{
3011
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
T
Thomas Graf 已提交
3012
	fib6_rules_cleanup();
L
Linus Torvalds 已提交
3013 3014
	xfrm6_fini();
	fib6_gc_cleanup();
3015
	unregister_pernet_subsys(&ip6_route_net_ops);
3016
	dst_entries_destroy(&ip6_dst_blackhole_ops);
3017
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
L
Linus Torvalds 已提交
3018
}