/*
 *	IPv6 tunneling device
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Ville Nuorvala		<vnuorval@tcs.hut.fi>
 *	Yasuyuki Kozakai	<kozakai@linux-ipv6.org>
 *
 *      Based on:
 *      linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c
 *
 *      RFC 2473
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 */

#include <linux/module.h>
22
#include <linux/capability.h>
L
Linus Torvalds 已提交
23 24 25
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/sockios.h>
26
#include <linux/icmp.h>
L
Linus Torvalds 已提交
27 28 29 30 31 32 33 34 35 36 37 38 39
#include <linux/if.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/if_tunnel.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/icmpv6.h>
#include <linux/init.h>
#include <linux/route.h>
#include <linux/rtnetlink.h>
#include <linux/netfilter_ipv6.h>
40
#include <linux/slab.h>
L
Linus Torvalds 已提交
41 42 43 44

#include <asm/uaccess.h>
#include <asm/atomic.h>

45
#include <net/icmp.h>
L
Linus Torvalds 已提交
46 47 48 49 50 51 52 53
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/ip6_tunnel.h>
#include <net/xfrm.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
54 55
#include <net/net_namespace.h>
#include <net/netns/generic.h>
L
Linus Torvalds 已提交
56 57

MODULE_AUTHOR("Ville Nuorvala");
58
MODULE_DESCRIPTION("IPv6 tunneling device");
L
Linus Torvalds 已提交
59
MODULE_LICENSE("GPL");
S
stephen hemminger 已提交
60
MODULE_ALIAS_NETDEV("ip6tnl0");
L
Linus Torvalds 已提交
61 62

/* Debug tracing: compiled out unless IP6_TNL_DEBUG is defined. */
#ifdef IP6_TNL_DEBUG
#define IP6_TNL_TRACE(x...) printk(KERN_DEBUG "%s:" x "\n", __func__)
#else
#define IP6_TNL_TRACE(x...) do {;} while(0)
#endif

/* Traffic-class bits of the IPv6 flow-info word and their bit position. */
#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK)
#define IPV6_TCLASS_SHIFT 20

/* Number of buckets in the (remote, local) tunnel hash table. */
#define HASH_SIZE  32

/* XOR-fold the four 32-bit words of an IPv6 address into a bucket index. */
#define HASH(addr) ((__force u32)((addr)->s6_addr32[0] ^ (addr)->s6_addr32[1] ^ \
		     (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \
		    (HASH_SIZE - 1))
L
Linus Torvalds 已提交
76

E
Eric Dumazet 已提交
77
static int ip6_tnl_dev_init(struct net_device *dev);
78
static void ip6_tnl_dev_setup(struct net_device *dev);
L
Linus Torvalds 已提交
79

80
static int ip6_tnl_net_id __read_mostly;
81
struct ip6_tnl_net {
82 83
	/* the IPv6 tunnel fallback device */
	struct net_device *fb_tnl_dev;
84
	/* lists for storing tunnels in use */
E
Eric Dumazet 已提交
85 86 87
	struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE];
	struct ip6_tnl __rcu *tnls_wc[1];
	struct ip6_tnl __rcu **tnls[2];
88 89
};

E
Eric Dumazet 已提交
90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
/*
 * Often modified stats are per cpu, others are shared (netdev->stats).
 *
 * One instance exists per possible CPU; ip6_get_stats() sums them into
 * dev->stats when the counters are read.
 */
struct pcpu_tstats {
	unsigned long	rx_packets;	/* packets received on this CPU */
	unsigned long	rx_bytes;	/* bytes received on this CPU */
	unsigned long	tx_packets;	/* packets transmitted on this CPU */
	unsigned long	tx_bytes;	/* bytes transmitted on this CPU */
};

static struct net_device_stats *ip6_get_stats(struct net_device *dev)
{
	struct pcpu_tstats sum = { 0 };
	int i;

	for_each_possible_cpu(i) {
		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);

		sum.rx_packets += tstats->rx_packets;
		sum.rx_bytes   += tstats->rx_bytes;
		sum.tx_packets += tstats->tx_packets;
		sum.tx_bytes   += tstats->tx_bytes;
	}
	dev->stats.rx_packets = sum.rx_packets;
	dev->stats.rx_bytes   = sum.rx_bytes;
	dev->stats.tx_packets = sum.tx_packets;
	dev->stats.tx_bytes   = sum.tx_bytes;
	return &dev->stats;
}

118
/*
 * Locking: hash tables are protected by RCU and RTNL
 */
L
Linus Torvalds 已提交
121 122 123 124 125

static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t)
{
	struct dst_entry *dst = t->dst_cache;

126
	if (dst && dst->obsolete &&
L
Linus Torvalds 已提交
127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
	    dst->ops->check(dst, t->dst_cookie) == NULL) {
		t->dst_cache = NULL;
		dst_release(dst);
		return NULL;
	}

	return dst;
}

/*
 * ip6_tnl_dst_reset - drop the tunnel's cached route, if any
 *   @t: tunnel whose dst cache is emptied
 */
static inline void ip6_tnl_dst_reset(struct ip6_tnl *t)
{
	struct dst_entry *old = t->dst_cache;

	dst_release(old);
	t->dst_cache = NULL;
}

static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	t->dst_cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0;
	dst_release(t->dst_cache);
	t->dst_cache = dst;
}

/**
151
 * ip6_tnl_lookup - fetch tunnel matching the end-point addresses
152 153
 *   @remote: the address of the tunnel exit-point
 *   @local: the address of the tunnel entry-point
L
Linus Torvalds 已提交
154
 *
155
 * Return:
L
Linus Torvalds 已提交
156
 *   tunnel matching given end-points if found,
157
 *   else fallback tunnel if its device is up,
L
Linus Torvalds 已提交
158 159 160
 *   else %NULL
 **/

161 162 163
#define for_each_ip6_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))

L
Linus Torvalds 已提交
164
static struct ip6_tnl *
165
ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local)
L
Linus Torvalds 已提交
166
{
E
Eric Dumazet 已提交
167 168
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(local);
L
Linus Torvalds 已提交
169
	struct ip6_tnl *t;
170
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
L
Linus Torvalds 已提交
171

172
	for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[h0 ^ h1]) {
L
Linus Torvalds 已提交
173 174 175 176 177
		if (ipv6_addr_equal(local, &t->parms.laddr) &&
		    ipv6_addr_equal(remote, &t->parms.raddr) &&
		    (t->dev->flags & IFF_UP))
			return t;
	}
178 179
	t = rcu_dereference(ip6n->tnls_wc[0]);
	if (t && (t->dev->flags & IFF_UP))
L
Linus Torvalds 已提交
180 181 182 183 184 185
		return t;

	return NULL;
}

/**
186
 * ip6_tnl_bucket - get head of list matching given tunnel parameters
187
 *   @p: parameters containing tunnel end-points
L
Linus Torvalds 已提交
188 189
 *
 * Description:
190
 *   ip6_tnl_bucket() returns the head of the list matching the
L
Linus Torvalds 已提交
191 192
 *   &struct in6_addr entries laddr and raddr in @p.
 *
193
 * Return: head of IPv6 tunnel list
L
Linus Torvalds 已提交
194 195
 **/

E
Eric Dumazet 已提交
196
static struct ip6_tnl __rcu **
197
ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct ip6_tnl_parm *p)
L
Linus Torvalds 已提交
198
{
199 200
	const struct in6_addr *remote = &p->raddr;
	const struct in6_addr *local = &p->laddr;
L
Linus Torvalds 已提交
201 202 203 204 205 206 207
	unsigned h = 0;
	int prio = 0;

	if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) {
		prio = 1;
		h = HASH(remote) ^ HASH(local);
	}
208
	return &ip6n->tnls[prio][h];
L
Linus Torvalds 已提交
209 210 211
}

/**
212
 * ip6_tnl_link - add tunnel to hash table
L
Linus Torvalds 已提交
213 214 215 216
 *   @t: tunnel to be added
 **/

static void
217
ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
L
Linus Torvalds 已提交
218
{
E
Eric Dumazet 已提交
219
	struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms);
L
Linus Torvalds 已提交
220

E
Eric Dumazet 已提交
221
	rcu_assign_pointer(t->next , rtnl_dereference(*tp));
222
	rcu_assign_pointer(*tp, t);
L
Linus Torvalds 已提交
223 224 225
}

/**
226
 * ip6_tnl_unlink - remove tunnel from hash table
L
Linus Torvalds 已提交
227 228 229 230
 *   @t: tunnel to be removed
 **/

static void
231
ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
L
Linus Torvalds 已提交
232
{
E
Eric Dumazet 已提交
233 234 235 236 237 238 239 240
	struct ip6_tnl __rcu **tp;
	struct ip6_tnl *iter;

	for (tp = ip6_tnl_bucket(ip6n, &t->parms);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
L
Linus Torvalds 已提交
241 242 243 244 245
			break;
		}
	}
}

E
Eric Dumazet 已提交
246 247 248 249 250 251
/*
 * ip6_dev_free - release a tunnel net_device
 *   @dev: device to free
 *
 * Frees the per-cpu statistics block attached to @dev and then the
 * net_device itself.
 */
static void ip6_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}

L
Linus Torvalds 已提交
252 253 254 255 256 257 258
/**
 * ip6_tnl_create() - create a new tunnel
 *   @p: tunnel parameters
 *   @pt: pointer to new tunnel
 *
 * Description:
 *   Create tunnel matching given parameters.
259 260
 *
 * Return:
261
 *   created tunnel or NULL
L
Linus Torvalds 已提交
262 263
 **/

264
static struct ip6_tnl *ip6_tnl_create(struct net *net, struct ip6_tnl_parm *p)
L
Linus Torvalds 已提交
265 266 267 268 269
{
	struct net_device *dev;
	struct ip6_tnl *t;
	char name[IFNAMSIZ];
	int err;
270
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
L
Linus Torvalds 已提交
271

272
	if (p->name[0])
L
Linus Torvalds 已提交
273
		strlcpy(name, p->name, IFNAMSIZ);
274 275 276
	else
		sprintf(name, "ip6tnl%%d");

277
	dev = alloc_netdev(sizeof (*t), name, ip6_tnl_dev_setup);
L
Linus Torvalds 已提交
278
	if (dev == NULL)
279
		goto failed;
L
Linus Torvalds 已提交
280

281 282
	dev_net_set(dev, net);

283 284 285 286 287
	if (strchr(name, '%')) {
		if (dev_alloc_name(dev, name) < 0)
			goto failed_free;
	}

288
	t = netdev_priv(dev);
L
Linus Torvalds 已提交
289
	t->parms = *p;
E
Eric Dumazet 已提交
290 291 292
	err = ip6_tnl_dev_init(dev);
	if (err < 0)
		goto failed_free;
L
Linus Torvalds 已提交
293

294 295 296
	if ((err = register_netdevice(dev)) < 0)
		goto failed_free;

L
Linus Torvalds 已提交
297
	dev_hold(dev);
298
	ip6_tnl_link(ip6n, t);
299
	return t;
300 301

failed_free:
E
Eric Dumazet 已提交
302
	ip6_dev_free(dev);
303 304
failed:
	return NULL;
L
Linus Torvalds 已提交
305 306 307
}

/**
308
 * ip6_tnl_locate - find or create tunnel matching given parameters
309
 *   @p: tunnel parameters
L
Linus Torvalds 已提交
310 311 312
 *   @create: != 0 if allowed to create new tunnel if no match found
 *
 * Description:
313
 *   ip6_tnl_locate() first tries to locate an existing tunnel
L
Linus Torvalds 已提交
314 315 316 317
 *   based on @parms. If this is unsuccessful, but @create is set a new
 *   tunnel device is created and registered for use.
 *
 * Return:
318
 *   matching tunnel or NULL
L
Linus Torvalds 已提交
319 320
 **/

321 322
static struct ip6_tnl *ip6_tnl_locate(struct net *net,
		struct ip6_tnl_parm *p, int create)
L
Linus Torvalds 已提交
323
{
324 325
	const struct in6_addr *remote = &p->raddr;
	const struct in6_addr *local = &p->laddr;
E
Eric Dumazet 已提交
326
	struct ip6_tnl __rcu **tp;
L
Linus Torvalds 已提交
327
	struct ip6_tnl *t;
328
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
L
Linus Torvalds 已提交
329

E
Eric Dumazet 已提交
330 331 332
	for (tp = ip6_tnl_bucket(ip6n, p);
	     (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next) {
L
Linus Torvalds 已提交
333
		if (ipv6_addr_equal(local, &t->parms.laddr) &&
334 335
		    ipv6_addr_equal(remote, &t->parms.raddr))
			return t;
L
Linus Torvalds 已提交
336 337
	}
	if (!create)
338
		return NULL;
339
	return ip6_tnl_create(net, p);
L
Linus Torvalds 已提交
340 341 342
}

/**
343
 * ip6_tnl_dev_uninit - tunnel device uninitializer
L
Linus Torvalds 已提交
344
 *   @dev: the device to be destroyed
345
 *
L
Linus Torvalds 已提交
346
 * Description:
347
 *   ip6_tnl_dev_uninit() removes tunnel from its list
L
Linus Torvalds 已提交
348 349 350
 **/

static void
351
ip6_tnl_dev_uninit(struct net_device *dev)
L
Linus Torvalds 已提交
352
{
353
	struct ip6_tnl *t = netdev_priv(dev);
354 355
	struct net *net = dev_net(dev);
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
L
Linus Torvalds 已提交
356

E
Eric Dumazet 已提交
357 358 359
	if (dev == ip6n->fb_tnl_dev)
		rcu_assign_pointer(ip6n->tnls_wc[0], NULL);
	else
360
		ip6_tnl_unlink(ip6n, t);
L
Linus Torvalds 已提交
361 362 363 364 365 366 367 368
	ip6_tnl_dst_reset(t);
	dev_put(dev);
}

/**
 * parse_tlv_tnl_enc_lim - handle encapsulation limit option
 *   @skb: received socket buffer
 *   @raw: start of the IPv6 header to parse; expected to point into the
 *         linear data of @skb at or after skb->data
 *
 * Description:
 *   Walks the extension header chain following the IPv6 header at @raw
 *   looking for a Tunnel Encapsulation Limit option inside a destination
 *   options header.  Parsing stops at a non-first fragment or as soon as
 *   a header cannot be pulled into the linear area.
 *
 *   pskb_may_pull() may reallocate the skb head, so every pointer into
 *   the packet is recomputed from skb->data after each pull and @raw
 *   itself is never dereferenced again.  Callers must likewise re-read
 *   their header pointers after this function returns.
 *
 * Return:
 *   0 if none was found,
 *   else index to encapsulation limit (relative to @raw)
 **/

static __u16
parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw)
{
	const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) raw;
	/* offsets are kept relative to skb->data so they survive a realloc */
	unsigned int nhoff = raw - skb->data;
	unsigned int off = nhoff + sizeof (*ipv6h);
	__u8 nexthdr = ipv6h->nexthdr;

	while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) {
		struct ipv6_opt_hdr *hdr;
		__u16 optlen;

		if (!pskb_may_pull(skb, off + sizeof (*hdr)))
			break;

		hdr = (struct ipv6_opt_hdr *) (skb->data + off);
		if (nexthdr == NEXTHDR_FRAGMENT)
			optlen = 8;
		else if (nexthdr == NEXTHDR_AUTH)
			optlen = (hdr->hdrlen + 2) << 2;
		else
			optlen = ipv6_optlen(hdr);

		/* make the whole header linear before looking inside it */
		if (!pskb_may_pull(skb, off + optlen))
			break;

		/* the pull above may have moved the data: refresh hdr */
		hdr = (struct ipv6_opt_hdr *) (skb->data + off);

		if (nexthdr == NEXTHDR_FRAGMENT) {
			struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr;

			if (frag_hdr->frag_off)
				break;
		}
		if (nexthdr == NEXTHDR_DEST) {
			/* skip the two fixed bytes (next header, length) */
			__u16 i = 2;

			while (1) {
				struct ipv6_tlv_tnl_enc_lim *tel;

				/* No more room for encapsulation limit */
				if (i + sizeof (*tel) > optlen)
					break;

				tel = (struct ipv6_tlv_tnl_enc_lim *)
					(skb->data + off + i);
				/* return index of option if found and valid */
				if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT &&
				    tel->length == 1)
					return i + off - nhoff;
				/* else jump to next option */
				if (tel->type)
					i += tel->length + 2;
				else
					i++;
			}
		}
		nexthdr = hdr->nexthdr;
		off += optlen;
	}
	return 0;
}

/**
427
 * ip6_tnl_err - tunnel error handler
L
Linus Torvalds 已提交
428 429
 *
 * Description:
430
 *   ip6_tnl_err() should handle errors in the tunnel according
L
Linus Torvalds 已提交
431 432 433
 *   to the specifications in RFC 2473.
 **/

H
Herbert Xu 已提交
434
static int
435
ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
436
	    u8 *type, u8 *code, int *msg, __u32 *info, int offset)
L
Linus Torvalds 已提交
437
{
438
	const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) skb->data;
L
Linus Torvalds 已提交
439 440
	struct ip6_tnl *t;
	int rel_msg = 0;
441 442
	u8 rel_type = ICMPV6_DEST_UNREACH;
	u8 rel_code = ICMPV6_ADDR_UNREACH;
L
Linus Torvalds 已提交
443 444
	__u32 rel_info = 0;
	__u16 len;
H
Herbert Xu 已提交
445
	int err = -ENOENT;
L
Linus Torvalds 已提交
446

447 448
	/* If the packet doesn't contain the original IPv6 header we are
	   in trouble since we might need the source address for further
L
Linus Torvalds 已提交
449 450
	   processing of the error. */

451
	rcu_read_lock();
452
	if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->daddr,
453
					&ipv6h->saddr)) == NULL)
L
Linus Torvalds 已提交
454 455
		goto out;

456 457 458
	if (t->parms.proto != ipproto && t->parms.proto != 0)
		goto out;

H
Herbert Xu 已提交
459 460
	err = 0;

461
	switch (*type) {
L
Linus Torvalds 已提交
462 463 464 465 466 467 468 469 470 471 472
		__u32 teli;
		struct ipv6_tlv_tnl_enc_lim *tel;
		__u32 mtu;
	case ICMPV6_DEST_UNREACH:
		if (net_ratelimit())
			printk(KERN_WARNING
			       "%s: Path to destination invalid "
			       "or inactive!\n", t->parms.name);
		rel_msg = 1;
		break;
	case ICMPV6_TIME_EXCEED:
473
		if ((*code) == ICMPV6_EXC_HOPLIMIT) {
L
Linus Torvalds 已提交
474 475 476
			if (net_ratelimit())
				printk(KERN_WARNING
				       "%s: Too small hop limit or "
477
				       "routing loop in tunnel!\n",
L
Linus Torvalds 已提交
478 479 480 481 482
				       t->parms.name);
			rel_msg = 1;
		}
		break;
	case ICMPV6_PARAMPROB:
483
		teli = 0;
484
		if ((*code) == ICMPV6_HDR_FIELD)
485
			teli = parse_tlv_tnl_enc_lim(skb, skb->data);
L
Linus Torvalds 已提交
486

A
Al Viro 已提交
487
		if (teli && teli == *info - 2) {
L
Linus Torvalds 已提交
488 489 490 491 492 493 494 495 496
			tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli];
			if (tel->encap_limit == 0) {
				if (net_ratelimit())
					printk(KERN_WARNING
					       "%s: Too small encapsulation "
					       "limit or routing loop in "
					       "tunnel!\n", t->parms.name);
				rel_msg = 1;
			}
497 498 499 500
		} else if (net_ratelimit()) {
			printk(KERN_WARNING
			       "%s: Recipient unable to parse tunneled "
			       "packet!\n ", t->parms.name);
L
Linus Torvalds 已提交
501 502 503
		}
		break;
	case ICMPV6_PKT_TOOBIG:
A
Al Viro 已提交
504
		mtu = *info - offset;
L
Linus Torvalds 已提交
505 506 507 508
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
		t->dev->mtu = mtu;

A
Al Viro 已提交
509
		if ((len = sizeof (*ipv6h) + ntohs(ipv6h->payload_len)) > mtu) {
L
Linus Torvalds 已提交
510 511 512 513 514 515 516
			rel_type = ICMPV6_PKT_TOOBIG;
			rel_code = 0;
			rel_info = mtu;
			rel_msg = 1;
		}
		break;
	}
517 518 519 520 521 522 523

	*type = rel_type;
	*code = rel_code;
	*info = rel_info;
	*msg = rel_msg;

out:
524
	rcu_read_unlock();
525 526 527
	return err;
}

528 529
/*
 * ip4ip6_err - ICMPv6 error handler for IPv4-in-IPv6 tunnels
 *   @skb: ICMPv6 error payload
 *   @opt: IPv6 control block of the error
 *   @type: received ICMPv6 type
 *   @code: received ICMPv6 code
 *   @offset: offset of the tunneled IPv4 packet inside @skb
 *   @info: received ICMPv6 info field (network order)
 *
 * Lets ip6_tnl_err() decide whether the error has to be relayed, maps the
 * ICMPv6 type/code to the ICMPv4 equivalent and sends that ICMPv4 error
 * for a clone of the inner packet.  Only address-unreachable and
 * packet-too-big are relayed.
 *
 * Returns the negative error of ip6_tnl_err(), else 0.
 */
static int
ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
	   u8 type, u8 code, int offset, __be32 info)
{
	int rel_msg = 0;
	u8 rel_type = type;
	u8 rel_code = code;
	__u32 rel_info = ntohl(info);
	const struct iphdr *eiph;
	struct sk_buff *skb2;
	struct rtable *rt;
	int err;

	err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code,
			  &rel_msg, &rel_info, offset);
	if (err < 0)
		return err;

	if (rel_msg == 0)
		return 0;

	/* translate the ICMPv6 error into its ICMPv4 counterpart */
	if (rel_type == ICMPV6_DEST_UNREACH &&
	    rel_code == ICMPV6_ADDR_UNREACH)
		rel_code = ICMP_HOST_UNREACH;
	else if (rel_type == ICMPV6_PKT_TOOBIG && rel_code == 0)
		rel_code = ICMP_FRAG_NEEDED;
	else
		return 0;
	rel_type = ICMP_DEST_UNREACH;

	if (!pskb_may_pull(skb, offset + sizeof(struct iphdr)))
		return 0;

	skb2 = skb_clone(skb, GFP_ATOMIC);
	if (!skb2)
		return 0;

	skb_dst_drop(skb2);

	/* make the clone start at the inner IPv4 header */
	skb_pull(skb2, offset);
	skb_reset_network_header(skb2);
	eiph = ip_hdr(skb2);

	/* Try to guess incoming interface */
	rt = ip_route_output_ports(dev_net(skb->dev), NULL,
				   eiph->saddr, 0,
				   0, 0,
				   IPPROTO_IPIP, RT_TOS(eiph->tos), 0);
	if (IS_ERR(rt))
		goto out;

	skb2->dev = rt->dst.dev;

	/* route "incoming" packet */
	if (rt->rt_flags & RTCF_LOCAL) {
		ip_rt_put(rt);
		rt = ip_route_output_ports(dev_net(skb->dev), NULL,
					   eiph->daddr, eiph->saddr,
					   0, 0,
					   IPPROTO_IPIP,
					   RT_TOS(eiph->tos), 0);
		if (IS_ERR(rt))
			goto out;
		if (rt->dst.dev->type != ARPHRD_TUNNEL) {
			ip_rt_put(rt);
			goto out;
		}
		skb_dst_set(skb2, &rt->dst);
	} else {
		ip_rt_put(rt);
		if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos,
				   skb2->dev) ||
		    skb_dst(skb2)->dev->type != ARPHRD_TUNNEL)
			goto out;
	}

	/* change mtu on this route */
	if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) {
		struct dst_entry *dst2 = skb_dst(skb2);

		if (rel_info > dst_mtu(dst2))
			goto out;

		dst2->ops->update_pmtu(dst2, rel_info);
	}

	icmp_send(skb2, rel_type, rel_code, htonl(rel_info));

out:
	kfree_skb(skb2);
	return 0;
}

628 629
/*
 * ip6ip6_err - ICMPv6 error handler for IPv6-in-IPv6 tunnels
 *   @skb: ICMPv6 error payload
 *   @opt: IPv6 control block of the error
 *   @type: received ICMPv6 type
 *   @code: received ICMPv6 code
 *   @offset: offset of the tunneled IPv6 packet inside @skb
 *   @info: received ICMPv6 info field (network order)
 *
 * Lets ip6_tnl_err() decide whether the error has to be relayed and, if
 * so, sends the resulting ICMPv6 error for a clone of the inner packet.
 *
 * Returns the negative error of ip6_tnl_err(), else 0.
 */
static int
ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
	   u8 type, u8 code, int offset, __be32 info)
{
	int rel_msg = 0;
	u8 rel_type = type;
	u8 rel_code = code;
	__u32 rel_info = ntohl(info);
	struct sk_buff *skb2;
	struct rt6_info *rt;
	int err;

	err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code,
			  &rel_msg, &rel_info, offset);
	if (err < 0)
		return err;

	if (!rel_msg)
		return 0;

	if (!pskb_may_pull(skb, offset + sizeof(struct ipv6hdr)))
		return 0;

	skb2 = skb_clone(skb, GFP_ATOMIC);
	if (!skb2)
		return 0;

	/* make the clone start at the inner IPv6 header */
	skb_dst_drop(skb2);
	skb_pull(skb2, offset);
	skb_reset_network_header(skb2);

	/* Try to guess incoming interface */
	rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr,
			NULL, 0, 0);

	if (rt && rt->rt6i_dev)
		skb2->dev = rt->rt6i_dev;

	icmpv6_send(skb2, rel_type, rel_code, rel_info);

	if (rt)
		dst_release(&rt->dst);

	kfree_skb(skb2);
	return 0;
}

672 673
static void ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
					const struct ipv6hdr *ipv6h,
674 675 676 677 678
					struct sk_buff *skb)
{
	__u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK;

	if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY)
679
		ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield);
680 681

	if (INET_ECN_is_ce(dsfield))
682
		IP_ECN_set_ce(ip_hdr(skb));
683 684
}

685 686
static void ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
					const struct ipv6hdr *ipv6h,
687
					struct sk_buff *skb)
L
Linus Torvalds 已提交
688
{
689
	if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY)
690
		ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb));
L
Linus Torvalds 已提交
691

692
	if (INET_ECN_is_ce(ipv6_get_dsfield(ipv6h)))
693
		IP6_ECN_set_ce(ipv6_hdr(skb));
L
Linus Torvalds 已提交
694
}
695

E
Eric Dumazet 已提交
696
/* called with rcu_read_lock() */
697 698 699 700
static inline int ip6_tnl_rcv_ctl(struct ip6_tnl *t)
{
	struct ip6_tnl_parm *p = &t->parms;
	int ret = 0;
701
	struct net *net = dev_net(t->dev);
702 703

	if (p->flags & IP6_TNL_F_CAP_RCV) {
704
		struct net_device *ldev = NULL;
705 706

		if (p->link)
E
Eric Dumazet 已提交
707
			ldev = dev_get_by_index_rcu(net, p->link);
708 709

		if ((ipv6_addr_is_multicast(&p->laddr) ||
710 711
		     likely(ipv6_chk_addr(net, &p->laddr, ldev, 0))) &&
		    likely(!ipv6_chk_addr(net, &p->raddr, NULL, 0)))
712 713 714 715 716
			ret = 1;

	}
	return ret;
}
L
Linus Torvalds 已提交
717 718

/**
719
 * ip6_tnl_rcv - decapsulate IPv6 packet and retransmit it locally
L
Linus Torvalds 已提交
720
 *   @skb: received socket buffer
721 722
 *   @protocol: ethernet protocol ID
 *   @dscp_ecn_decapsulate: the function to decapsulate DSCP code and ECN
L
Linus Torvalds 已提交
723 724 725 726
 *
 * Return: 0
 **/

727
static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
728
		       __u8 ipproto,
729 730
		       void (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
						    const struct ipv6hdr *ipv6h,
731
						    struct sk_buff *skb))
L
Linus Torvalds 已提交
732 733
{
	struct ip6_tnl *t;
734
	const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
L
Linus Torvalds 已提交
735

736
	rcu_read_lock();
L
Linus Torvalds 已提交
737

738
	if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr,
739
					&ipv6h->daddr)) != NULL) {
E
Eric Dumazet 已提交
740 741
		struct pcpu_tstats *tstats;

742
		if (t->parms.proto != ipproto && t->parms.proto != 0) {
743
			rcu_read_unlock();
744 745 746
			goto discard;
		}

L
Linus Torvalds 已提交
747
		if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
748
			rcu_read_unlock();
749
			goto discard;
L
Linus Torvalds 已提交
750 751
		}

752
		if (!ip6_tnl_rcv_ctl(t)) {
753
			t->dev->stats.rx_dropped++;
754
			rcu_read_unlock();
L
Linus Torvalds 已提交
755 756 757
			goto discard;
		}
		secpath_reset(skb);
758
		skb->mac_header = skb->network_header;
759
		skb_reset_network_header(skb);
760
		skb->protocol = htons(protocol);
L
Linus Torvalds 已提交
761 762
		skb->pkt_type = PACKET_HOST;
		memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
763

E
Eric Dumazet 已提交
764 765 766 767 768
		tstats = this_cpu_ptr(t->dev->tstats);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;

		__skb_tunnel_rx(skb, t->dev);
769

770
		dscp_ecn_decapsulate(t, ipv6h, skb);
E
Eric Dumazet 已提交
771

772
		netif_rx(skb);
E
Eric Dumazet 已提交
773

774
		rcu_read_unlock();
L
Linus Torvalds 已提交
775 776
		return 0;
	}
777
	rcu_read_unlock();
L
Linus Torvalds 已提交
778
	return 1;
779 780 781 782

discard:
	kfree_skb(skb);
	return 0;
L
Linus Torvalds 已提交
783 784
}

785 786
static int ip4ip6_rcv(struct sk_buff *skb)
{
787 788
	return ip6_tnl_rcv(skb, ETH_P_IP, IPPROTO_IPIP,
			   ip4ip6_dscp_ecn_decapsulate);
789 790
}

791 792
static int ip6ip6_rcv(struct sk_buff *skb)
{
793 794
	return ip6_tnl_rcv(skb, ETH_P_IPV6, IPPROTO_IPV6,
			   ip6ip6_dscp_ecn_decapsulate);
795 796
}

797 798 799 800
/*
 * Transmit options carrying a single 8-byte destination options header,
 * filled in by init_tel_txopt() with a tunnel encapsulation limit.
 */
struct ipv6_tel_txoption {
	struct ipv6_txoptions ops;	/* ops.dst0opt points at dst_opt */
	__u8 dst_opt[8];		/* raw destination options header */
};
L
Linus Torvalds 已提交
801

802 803 804
/*
 * init_tel_txopt - build tx options holding a tunnel encapsulation limit
 *   @opt: option block to initialise (fully overwritten)
 *   @encap_limit: value stored in the encapsulation limit option
 *
 * Lays out an 8-byte destination options header: bytes 0-1 stay zero,
 * bytes 2-4 hold the encapsulation limit TLV (type, length 1, value) and
 * bytes 5-7 a PadN option with one byte of padding.
 */
static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit)
{
	__u8 *o;

	memset(opt, 0, sizeof(struct ipv6_tel_txoption));
	o = opt->dst_opt;

	o[2] = IPV6_TLV_TNL_ENCAP_LIMIT;
	o[3] = 1;
	o[4] = encap_limit;
	o[5] = IPV6_TLV_PADN;
	o[6] = 1;

	opt->ops.dst0opt = (struct ipv6_opt_hdr *) o;
	opt->ops.opt_nflen = 8;
}

/**
817
 * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own
L
Linus Torvalds 已提交
818
 *   @t: the outgoing tunnel device
819
 *   @hdr: IPv6 header from the incoming packet
L
Linus Torvalds 已提交
820 821
 *
 * Description:
822
 *   Avoid trivial tunneling loop by checking that tunnel exit-point
L
Linus Torvalds 已提交
823 824
 *   doesn't match source of incoming packet.
 *
825
 * Return:
L
Linus Torvalds 已提交
826 827 828 829 830
 *   1 if conflict,
 *   0 else
 **/

static inline int
831
ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr)
L
Linus Torvalds 已提交
832 833 834 835
{
	return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr);
}

836 837 838 839
static inline int ip6_tnl_xmit_ctl(struct ip6_tnl *t)
{
	struct ip6_tnl_parm *p = &t->parms;
	int ret = 0;
840
	struct net *net = dev_net(t->dev);
841

842
	if (p->flags & IP6_TNL_F_CAP_XMIT) {
843 844
		struct net_device *ldev = NULL;

E
Eric Dumazet 已提交
845
		rcu_read_lock();
846
		if (p->link)
E
Eric Dumazet 已提交
847
			ldev = dev_get_by_index_rcu(net, p->link);
848

849
		if (unlikely(!ipv6_chk_addr(net, &p->laddr, ldev, 0)))
850 851 852 853
			printk(KERN_WARNING
			       "%s xmit: Local address not yet configured!\n",
			       p->name);
		else if (!ipv6_addr_is_multicast(&p->raddr) &&
854
			 unlikely(ipv6_chk_addr(net, &p->raddr, NULL, 0)))
855 856 857 858 859 860
			printk(KERN_WARNING
			       "%s xmit: Routing loop! "
			       "Remote address found on this node!\n",
			       p->name);
		else
			ret = 1;
E
Eric Dumazet 已提交
861
		rcu_read_unlock();
862 863 864
	}
	return ret;
}
L
Linus Torvalds 已提交
865
/**
866
 * ip6_tnl_xmit2 - encapsulate packet and send
L
Linus Torvalds 已提交
867
 *   @skb: the outgoing socket buffer
868
 *   @dev: the outgoing tunnel device
869 870 871 872
 *   @dsfield: dscp code for outer header
 *   @fl: flow of tunneled packet
 *   @encap_limit: encapsulation limit
 *   @pmtu: Path MTU is stored if packet is too big
L
Linus Torvalds 已提交
873 874 875 876 877
 *
 * Description:
 *   Build new header and do some sanity checks on the packet before sending
 *   it.
 *
878
 * Return:
879
 *   0 on success
880 881
 *   -1 fail
 *   %-EMSGSIZE message too big. return mtu in this case.
L
Linus Torvalds 已提交
882 883
 **/

884 885 886
static int ip6_tnl_xmit2(struct sk_buff *skb,
			 struct net_device *dev,
			 __u8 dsfield,
887
			 struct flowi6 *fl6,
888 889
			 int encap_limit,
			 __u32 *pmtu)
L
Linus Torvalds 已提交
890
{
A
Alexey Dobriyan 已提交
891
	struct net *net = dev_net(dev);
892
	struct ip6_tnl *t = netdev_priv(dev);
893
	struct net_device_stats *stats = &t->dev->stats;
894
	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
895
	struct ipv6_tel_txoption opt;
L
Linus Torvalds 已提交
896 897 898
	struct dst_entry *dst;
	struct net_device *tdev;
	int mtu;
899
	unsigned int max_headroom = sizeof(struct ipv6hdr);
L
Linus Torvalds 已提交
900
	u8 proto;
901
	int err = -1;
L
Linus Torvalds 已提交
902 903 904 905
	int pkt_len;

	if ((dst = ip6_tnl_dst_check(t)) != NULL)
		dst_hold(dst);
906
	else {
907
		dst = ip6_route_output(net, NULL, fl6);
L
Linus Torvalds 已提交
908

909
		if (dst->error)
910
			goto tx_err_link_failure;
911
		dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0);
912 913 914 915 916
		if (IS_ERR(dst)) {
			err = PTR_ERR(dst);
			dst = NULL;
			goto tx_err_link_failure;
		}
917
	}
L
Linus Torvalds 已提交
918 919 920 921 922 923

	tdev = dst->dev;

	if (tdev == dev) {
		stats->collisions++;
		if (net_ratelimit())
924
			printk(KERN_WARNING
L
Linus Torvalds 已提交
925 926 927 928 929
			       "%s: Local routing loop detected!\n",
			       t->parms.name);
		goto tx_err_dst_release;
	}
	mtu = dst_mtu(dst) - sizeof (*ipv6h);
930
	if (encap_limit >= 0) {
L
Linus Torvalds 已提交
931 932 933 934 935
		max_headroom += 8;
		mtu -= 8;
	}
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;
E
Eric Dumazet 已提交
936 937
	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
L
Linus Torvalds 已提交
938
	if (skb->len > mtu) {
939 940
		*pmtu = mtu;
		err = -EMSGSIZE;
L
Linus Torvalds 已提交
941 942 943 944 945 946 947
		goto tx_err_dst_release;
	}

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom += LL_RESERVED_SPACE(tdev);
948

949 950
	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
L
Linus Torvalds 已提交
951
		struct sk_buff *new_skb;
952

L
Linus Torvalds 已提交
953 954 955 956 957 958 959 960
		if (!(new_skb = skb_realloc_headroom(skb, max_headroom)))
			goto tx_err_dst_release;

		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		kfree_skb(skb);
		skb = new_skb;
	}
E
Eric Dumazet 已提交
961 962
	skb_dst_drop(skb);
	skb_dst_set(skb, dst_clone(dst));
L
Linus Torvalds 已提交
963

964
	skb->transport_header = skb->network_header;
L
Linus Torvalds 已提交
965

966
	proto = fl6->flowi6_proto;
967 968 969 970
	if (encap_limit >= 0) {
		init_tel_txopt(&opt, encap_limit);
		ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL);
	}
971 972
	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
973
	ipv6h = ipv6_hdr(skb);
974
	*(__be32*)ipv6h = fl6->flowlabel | htonl(0x60000000);
L
Linus Torvalds 已提交
975 976 977 978
	dsfield = INET_ECN_encapsulate(0, dsfield);
	ipv6_change_dsfield(ipv6h, ~INET_ECN_MASK, dsfield);
	ipv6h->hop_limit = t->parms.hop_limit;
	ipv6h->nexthdr = proto;
979 980
	ipv6_addr_copy(&ipv6h->saddr, &fl6->saddr);
	ipv6_addr_copy(&ipv6h->daddr, &fl6->daddr);
L
Linus Torvalds 已提交
981 982
	nf_reset(skb);
	pkt_len = skb->len;
H
Herbert Xu 已提交
983
	err = ip6_local_out(skb);
L
Linus Torvalds 已提交
984

985
	if (net_xmit_eval(err) == 0) {
E
Eric Dumazet 已提交
986 987 988 989
		struct pcpu_tstats *tstats = this_cpu_ptr(t->dev->tstats);

		tstats->tx_bytes += pkt_len;
		tstats->tx_packets++;
L
Linus Torvalds 已提交
990 991 992 993 994 995 996 997 998 999 1000
	} else {
		stats->tx_errors++;
		stats->tx_aborted_errors++;
	}
	ip6_tnl_dst_store(t, dst);
	return 0;
tx_err_link_failure:
	stats->tx_carrier_errors++;
	dst_link_failure(skb);
tx_err_dst_release:
	dst_release(dst);
1001 1002 1003
	return err;
}

1004 1005 1006 1007
static inline int
ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip6_tnl *t = netdev_priv(dev);
1008
	const struct iphdr  *iph = ip_hdr(skb);
1009
	int encap_limit = -1;
1010
	struct flowi6 fl6;
1011 1012 1013 1014
	__u8 dsfield;
	__u32 mtu;
	int err;

1015 1016
	if ((t->parms.proto != IPPROTO_IPIP && t->parms.proto != 0) ||
	    !ip6_tnl_xmit_ctl(t))
1017 1018 1019 1020 1021
		return -1;

	if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
		encap_limit = t->parms.encap_limit;

1022 1023
	memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6));
	fl6.flowi6_proto = IPPROTO_IPIP;
1024 1025 1026 1027

	dsfield = ipv4_get_dsfield(iph);

	if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS))
1028
		fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
A
Al Viro 已提交
1029
					  & IPV6_TCLASS_MASK;
1030

1031
	err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042
	if (err != 0) {
		/* XXX: send ICMP error even if DF is not set. */
		if (err == -EMSGSIZE)
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				  htonl(mtu));
		return -1;
	}

	return 0;
}

1043 1044 1045 1046
static inline int
ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip6_tnl *t = netdev_priv(dev);
1047
	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
1048 1049
	int encap_limit = -1;
	__u16 offset;
1050
	struct flowi6 fl6;
1051 1052 1053 1054
	__u8 dsfield;
	__u32 mtu;
	int err;

1055 1056
	if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) ||
	    !ip6_tnl_xmit_ctl(t) || ip6_tnl_addr_conflict(t, ipv6h))
1057 1058
		return -1;

1059 1060
	offset = parse_tlv_tnl_enc_lim(skb, skb_network_header(skb));
	if (offset > 0) {
1061
		struct ipv6_tlv_tnl_enc_lim *tel;
1062
		tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
1063 1064
		if (tel->encap_limit == 0) {
			icmpv6_send(skb, ICMPV6_PARAMPROB,
1065
				    ICMPV6_HDR_FIELD, offset + 2);
1066 1067 1068 1069 1070 1071
			return -1;
		}
		encap_limit = tel->encap_limit - 1;
	} else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
		encap_limit = t->parms.encap_limit;

1072 1073
	memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6));
	fl6.flowi6_proto = IPPROTO_IPV6;
1074 1075 1076

	dsfield = ipv6_get_dsfield(ipv6h);
	if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS))
1077
		fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK);
1078
	if ((t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL))
1079
		fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_FLOWLABEL_MASK);
1080

1081
	err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
1082 1083
	if (err != 0) {
		if (err == -EMSGSIZE)
1084
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1085 1086 1087 1088 1089 1090
		return -1;
	}

	return 0;
}

1091
static netdev_tx_t
1092 1093 1094
ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip6_tnl *t = netdev_priv(dev);
1095
	struct net_device_stats *stats = &t->dev->stats;
1096 1097 1098
	int ret;

	switch (skb->protocol) {
1099
	case htons(ETH_P_IP):
1100 1101
		ret = ip4ip6_tnl_xmit(skb, dev);
		break;
1102
	case htons(ETH_P_IPV6):
1103 1104 1105 1106 1107 1108 1109 1110 1111
		ret = ip6ip6_tnl_xmit(skb, dev);
		break;
	default:
		goto tx_err;
	}

	if (ret < 0)
		goto tx_err;

1112
	return NETDEV_TX_OK;
1113

L
Linus Torvalds 已提交
1114 1115 1116 1117
tx_err:
	stats->tx_errors++;
	stats->tx_dropped++;
	kfree_skb(skb);
1118
	return NETDEV_TX_OK;
L
Linus Torvalds 已提交
1119 1120 1121 1122 1123
}

static void ip6_tnl_set_cap(struct ip6_tnl *t)
{
	struct ip6_tnl_parm *p = &t->parms;
1124 1125
	int ltype = ipv6_addr_type(&p->laddr);
	int rtype = ipv6_addr_type(&p->raddr);
L
Linus Torvalds 已提交
1126 1127 1128

	p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV);

1129 1130 1131
	if (ltype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) &&
	    rtype & (IPV6_ADDR_UNICAST|IPV6_ADDR_MULTICAST) &&
	    !((ltype|rtype) & IPV6_ADDR_LOOPBACK) &&
1132
	    (!((ltype|rtype) & IPV6_ADDR_LINKLOCAL) || p->link)) {
1133 1134 1135 1136
		if (ltype&IPV6_ADDR_UNICAST)
			p->flags |= IP6_TNL_F_CAP_XMIT;
		if (rtype&IPV6_ADDR_UNICAST)
			p->flags |= IP6_TNL_F_CAP_RCV;
L
Linus Torvalds 已提交
1137 1138 1139
	}
}

1140
static void ip6_tnl_link_config(struct ip6_tnl *t)
L
Linus Torvalds 已提交
1141 1142 1143
{
	struct net_device *dev = t->dev;
	struct ip6_tnl_parm *p = &t->parms;
1144
	struct flowi6 *fl6 = &t->fl.u.ip6;
L
Linus Torvalds 已提交
1145

1146 1147
	memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
	memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
L
Linus Torvalds 已提交
1148 1149

	/* Set up flowi template */
1150 1151 1152 1153
	ipv6_addr_copy(&fl6->saddr, &p->laddr);
	ipv6_addr_copy(&fl6->daddr, &p->raddr);
	fl6->flowi6_oif = p->link;
	fl6->flowlabel = 0;
L
Linus Torvalds 已提交
1154 1155

	if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS))
1156
		fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo;
L
Linus Torvalds 已提交
1157
	if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL))
1158
		fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo;
L
Linus Torvalds 已提交
1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169

	ip6_tnl_set_cap(t);

	if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV)
		dev->flags |= IFF_POINTOPOINT;
	else
		dev->flags &= ~IFF_POINTOPOINT;

	dev->iflink = p->link;

	if (p->flags & IP6_TNL_F_CAP_XMIT) {
1170 1171 1172
		int strict = (ipv6_addr_type(&p->raddr) &
			      (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL));

1173 1174
		struct rt6_info *rt = rt6_lookup(dev_net(dev),
						 &p->raddr, &p->laddr,
1175
						 p->link, strict);
L
Linus Torvalds 已提交
1176 1177 1178 1179 1180 1181 1182 1183 1184

		if (rt == NULL)
			return;

		if (rt->rt6i_dev) {
			dev->hard_header_len = rt->rt6i_dev->hard_header_len +
				sizeof (struct ipv6hdr);

			dev->mtu = rt->rt6i_dev->mtu - sizeof (struct ipv6hdr);
1185 1186
			if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
				dev->mtu-=8;
L
Linus Torvalds 已提交
1187 1188 1189 1190

			if (dev->mtu < IPV6_MIN_MTU)
				dev->mtu = IPV6_MIN_MTU;
		}
1191
		dst_release(&rt->dst);
L
Linus Torvalds 已提交
1192 1193 1194 1195
	}
}

/**
1196
 * ip6_tnl_change - update the tunnel parameters
L
Linus Torvalds 已提交
1197 1198 1199 1200
 *   @t: tunnel to be changed
 *   @p: tunnel configuration parameters
 *
 * Description:
1201
 *   ip6_tnl_change() updates the tunnel parameters
L
Linus Torvalds 已提交
1202 1203 1204
 **/

static int
1205
ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p)
L
Linus Torvalds 已提交
1206 1207 1208 1209 1210 1211 1212
{
	ipv6_addr_copy(&t->parms.laddr, &p->laddr);
	ipv6_addr_copy(&t->parms.raddr, &p->raddr);
	t->parms.flags = p->flags;
	t->parms.hop_limit = p->hop_limit;
	t->parms.encap_limit = p->encap_limit;
	t->parms.flowinfo = p->flowinfo;
1213
	t->parms.link = p->link;
1214
	t->parms.proto = p->proto;
1215
	ip6_tnl_dst_reset(t);
1216
	ip6_tnl_link_config(t);
L
Linus Torvalds 已提交
1217 1218 1219 1220
	return 0;
}

/**
1221
 * ip6_tnl_ioctl - configure ipv6 tunnels from userspace
L
Linus Torvalds 已提交
1222 1223 1224 1225 1226
 *   @dev: virtual device associated with tunnel
 *   @ifr: parameters passed from userspace
 *   @cmd: command to be performed
 *
 * Description:
1227
 *   ip6_tnl_ioctl() is used for managing IPv6 tunnels
1228
 *   from userspace.
L
Linus Torvalds 已提交
1229 1230 1231 1232 1233 1234 1235
 *
 *   The possible commands are the following:
 *     %SIOCGETTUNNEL: get tunnel parameters for device
 *     %SIOCADDTUNNEL: add tunnel matching given tunnel parameters
 *     %SIOCCHGTUNNEL: change tunnel parameters to those given
 *     %SIOCDELTUNNEL: delete tunnel
 *
1236
 *   The fallback device "ip6tnl0", created during module
L
Linus Torvalds 已提交
1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248
 *   initialization, can be used for creating other tunnel devices.
 *
 * Return:
 *   0 on success,
 *   %-EFAULT if unable to copy data to or from userspace,
 *   %-EPERM if current process hasn't %CAP_NET_ADMIN set
 *   %-EINVAL if passed tunnel parameters are invalid,
 *   %-EEXIST if changing a tunnel's parameters would cause a conflict
 *   %-ENODEV if attempting to change or delete a nonexisting device
 **/

static int
1249
ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
L
Linus Torvalds 已提交
1250 1251 1252 1253
{
	int err = 0;
	struct ip6_tnl_parm p;
	struct ip6_tnl *t = NULL;
1254 1255
	struct net *net = dev_net(dev);
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
L
Linus Torvalds 已提交
1256 1257 1258

	switch (cmd) {
	case SIOCGETTUNNEL:
1259
		if (dev == ip6n->fb_tnl_dev) {
1260
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) {
L
Linus Torvalds 已提交
1261 1262 1263
				err = -EFAULT;
				break;
			}
1264
			t = ip6_tnl_locate(net, &p, 0);
1265 1266
		}
		if (t == NULL)
1267
			t = netdev_priv(dev);
L
Linus Torvalds 已提交
1268 1269 1270 1271 1272 1273 1274 1275 1276 1277
		memcpy(&p, &t->parms, sizeof (p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof (p))) {
			err = -EFAULT;
		}
		break;
	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			break;
1278 1279
		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p)))
L
Linus Torvalds 已提交
1280
			break;
1281
		err = -EINVAL;
1282 1283
		if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP &&
		    p.proto != 0)
L
Linus Torvalds 已提交
1284
			break;
1285
		t = ip6_tnl_locate(net, &p, cmd == SIOCADDTUNNEL);
1286
		if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) {
1287 1288 1289 1290 1291 1292 1293 1294
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else
				t = netdev_priv(dev);

1295
			ip6_tnl_unlink(ip6n, t);
1296
			synchronize_net();
1297
			err = ip6_tnl_change(t, &p);
1298
			ip6_tnl_link(ip6n, t);
L
Linus Torvalds 已提交
1299 1300
			netdev_state_change(dev);
		}
1301
		if (t) {
L
Linus Torvalds 已提交
1302
			err = 0;
1303 1304 1305 1306 1307
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof (p)))
				err = -EFAULT;

		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
L
Linus Torvalds 已提交
1308 1309 1310 1311 1312 1313
		break;
	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			break;

1314
		if (dev == ip6n->fb_tnl_dev) {
1315 1316
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p)))
L
Linus Torvalds 已提交
1317
				break;
1318
			err = -ENOENT;
1319
			if ((t = ip6_tnl_locate(net, &p, 0)) == NULL)
L
Linus Torvalds 已提交
1320
				break;
1321
			err = -EPERM;
1322
			if (t->dev == ip6n->fb_tnl_dev)
L
Linus Torvalds 已提交
1323
				break;
1324
			dev = t->dev;
L
Linus Torvalds 已提交
1325
		}
1326 1327
		err = 0;
		unregister_netdevice(dev);
L
Linus Torvalds 已提交
1328 1329 1330 1331 1332 1333 1334 1335
		break;
	default:
		err = -EINVAL;
	}
	return err;
}

/**
1336
 * ip6_tnl_change_mtu - change mtu manually for tunnel device
L
Linus Torvalds 已提交
1337 1338 1339 1340 1341 1342 1343 1344 1345
 *   @dev: virtual device associated with tunnel
 *   @new_mtu: the new mtu
 *
 * Return:
 *   0 on success,
 *   %-EINVAL if mtu too small
 **/

static int
1346
ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
L
Linus Torvalds 已提交
1347 1348 1349 1350 1351 1352 1353 1354
{
	if (new_mtu < IPV6_MIN_MTU) {
		return -EINVAL;
	}
	dev->mtu = new_mtu;
	return 0;
}

1355 1356

static const struct net_device_ops ip6_tnl_netdev_ops = {
E
Eric Dumazet 已提交
1357
	.ndo_uninit	= ip6_tnl_dev_uninit,
1358
	.ndo_start_xmit = ip6_tnl_xmit,
E
Eric Dumazet 已提交
1359
	.ndo_do_ioctl	= ip6_tnl_ioctl,
1360
	.ndo_change_mtu = ip6_tnl_change_mtu,
E
Eric Dumazet 已提交
1361
	.ndo_get_stats	= ip6_get_stats,
1362 1363
};

E
Eric Dumazet 已提交
1364

L
Linus Torvalds 已提交
1365
/**
1366
 * ip6_tnl_dev_setup - setup virtual tunnel device
L
Linus Torvalds 已提交
1367 1368 1369 1370 1371 1372
 *   @dev: virtual device associated with tunnel
 *
 * Description:
 *   Initialize function pointers and device parameters
 **/

1373
static void ip6_tnl_dev_setup(struct net_device *dev)
L
Linus Torvalds 已提交
1374
{
1375 1376
	struct ip6_tnl *t;

1377
	dev->netdev_ops = &ip6_tnl_netdev_ops;
E
Eric Dumazet 已提交
1378
	dev->destructor = ip6_dev_free;
L
Linus Torvalds 已提交
1379 1380 1381 1382

	dev->type = ARPHRD_TUNNEL6;
	dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr);
	dev->mtu = ETH_DATA_LEN - sizeof (struct ipv6hdr);
1383 1384 1385
	t = netdev_priv(dev);
	if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
		dev->mtu-=8;
L
Linus Torvalds 已提交
1386 1387
	dev->flags |= IFF_NOARP;
	dev->addr_len = sizeof(struct in6_addr);
1388
	dev->features |= NETIF_F_NETNS_LOCAL;
1389
	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
L
Linus Torvalds 已提交
1390 1391 1392 1393
}


/**
1394
 * ip6_tnl_dev_init_gen - general initializer for all tunnel devices
L
Linus Torvalds 已提交
1395 1396 1397
 *   @dev: virtual device associated with tunnel
 **/

E
Eric Dumazet 已提交
1398
static inline int
1399
ip6_tnl_dev_init_gen(struct net_device *dev)
L
Linus Torvalds 已提交
1400
{
1401
	struct ip6_tnl *t = netdev_priv(dev);
E
Eric Dumazet 已提交
1402

L
Linus Torvalds 已提交
1403 1404
	t->dev = dev;
	strcpy(t->parms.name, dev->name);
E
Eric Dumazet 已提交
1405 1406 1407 1408
	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;
	return 0;
L
Linus Torvalds 已提交
1409 1410 1411
}

/**
 * ip6_tnl_dev_init - initializer for all non fallback tunnel devices
 *   @dev: virtual device associated with tunnel
 *
 * Description:
 *   Runs the general initializer and, if it succeeds, the link
 *   configuration.
 *
 * Return:
 *   0 on success, otherwise the error from ip6_tnl_dev_init_gen()
 **/

static int ip6_tnl_dev_init(struct net_device *dev)
{
	struct ip6_tnl *tunnel = netdev_priv(dev);
	int rc;

	rc = ip6_tnl_dev_init_gen(dev);
	if (rc)
		return rc;

	ip6_tnl_link_config(tunnel);
	return 0;
}

/**
1428
 * ip6_fb_tnl_dev_init - initializer for fallback tunnel device
L
Linus Torvalds 已提交
1429 1430 1431 1432 1433
 *   @dev: fallback device
 *
 * Return: 0
 **/

E
Eric Dumazet 已提交
1434
static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
L
Linus Torvalds 已提交
1435
{
1436
	struct ip6_tnl *t = netdev_priv(dev);
1437 1438
	struct net *net = dev_net(dev);
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
E
Eric Dumazet 已提交
1439 1440 1441 1442
	int err = ip6_tnl_dev_init_gen(dev);

	if (err)
		return err;
1443

1444
	t->parms.proto = IPPROTO_IPV6;
L
Linus Torvalds 已提交
1445
	dev_hold(dev);
E
Eric Dumazet 已提交
1446
	rcu_assign_pointer(ip6n->tnls_wc[0], t);
E
Eric Dumazet 已提交
1447
	return 0;
L
Linus Torvalds 已提交
1448 1449
}

1450
/* Receive and error hooks for IPv4 payloads (registered for AF_INET). */
static struct xfrm6_tunnel ip4ip6_handler __read_mostly = {
	.priority	= 1,
	.handler	= ip4ip6_rcv,
	.err_handler	= ip4ip6_err,
};

1456
/* Receive and error hooks for IPv6 payloads (registered for AF_INET6). */
static struct xfrm6_tunnel ip6ip6_handler __read_mostly = {
	.priority	= 1,
	.handler	= ip6ip6_rcv,
	.err_handler	= ip6ip6_err,
};

1462
/**
 * ip6_tnl_destroy_tunnels - unregister every tunnel device of a namespace
 *   @ip6n: per-namespace tunnel state
 *
 * Description:
 *   Queues the device of every tunnel found in the tnls_r_l hash chains,
 *   then the device in tnls_wc[0], and unregisters them in one batch.
 *   The pointers are read with rtnl_dereference().
 **/

static void __net_exit ip6_tnl_destroy_tunnels(struct ip6_tnl_net *ip6n)
{
	struct ip6_tnl *tunnel;
	LIST_HEAD(doomed);
	int bucket;

	for (bucket = 0; bucket < HASH_SIZE; bucket++) {
		for (tunnel = rtnl_dereference(ip6n->tnls_r_l[bucket]);
		     tunnel != NULL;
		     tunnel = rtnl_dereference(tunnel->next))
			unregister_netdevice_queue(tunnel->dev, &doomed);
	}

	tunnel = rtnl_dereference(ip6n->tnls_wc[0]);
	unregister_netdevice_queue(tunnel->dev, &doomed);

	unregister_netdevice_many(&doomed);
}

1481
/**
 * ip6_tnl_init_net - set up the tunnel state of a network namespace
 *   @net: the namespace being initialized
 *
 * Description:
 *   Wires up the lookup tables and allocates, initializes and registers
 *   the fallback device "ip6tnl0".
 *
 * Return:
 *   0 on success,
 *   %-ENOMEM if the fallback device could not be allocated, otherwise
 *   the error from ip6_fb_tnl_dev_init() or register_netdev()
 **/

static int __net_init ip6_tnl_init_net(struct net *net)
{
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
	struct net_device *fb_dev;
	int rc;

	ip6n->tnls[0] = ip6n->tnls_wc;
	ip6n->tnls[1] = ip6n->tnls_r_l;

	fb_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0",
			      ip6_tnl_dev_setup);
	ip6n->fb_tnl_dev = fb_dev;
	if (!fb_dev)
		return -ENOMEM;

	dev_net_set(fb_dev, net);

	rc = ip6_fb_tnl_dev_init(fb_dev);
	if (rc >= 0)
		rc = register_netdev(fb_dev);
	if (rc < 0) {
		ip6_dev_free(fb_dev);
		return rc;
	}
	return 0;
}

1512
/**
 * ip6_tnl_exit_net - tear down the tunnel state of a network namespace
 *   @net: the namespace being destroyed
 *
 * Description:
 *   Destroys all tunnels of the namespace while holding the rtnl lock.
 **/

static void __net_exit ip6_tnl_exit_net(struct net *net)
{
	struct ip6_tnl_net *state = net_generic(net, ip6_tnl_net_id);

	rtnl_lock();
	ip6_tnl_destroy_tunnels(state);
	rtnl_unlock();
}
}

static struct pernet_operations ip6_tnl_net_ops = {
	.init = ip6_tnl_init_net,
	.exit = ip6_tnl_exit_net,
1524 1525
	.id   = &ip6_tnl_net_id,
	.size = sizeof(struct ip6_tnl_net),
1526 1527
};

L
Linus Torvalds 已提交
1528 1529 1530 1531 1532 1533 1534 1535 1536 1537
/**
 * ip6_tunnel_init - register protocol and reserve needed resources
 *
 * Description:
 *   Registers the per-namespace operations first, then the IPv4 and
 *   IPv6 payload handlers.  A failure undoes the earlier steps in
 *   reverse order.
 *
 * Return: 0 on success, otherwise the error of the step that failed
 **/

static int __init ip6_tunnel_init(void)
{
	int rc;

	rc = register_pernet_device(&ip6_tnl_net_ops);
	if (rc < 0)
		return rc;

	rc = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET);
	if (rc < 0) {
		printk(KERN_ERR "ip6_tunnel init: can't register ip4ip6\n");
		goto undo_pernet;
	}

	rc = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6);
	if (rc < 0) {
		printk(KERN_ERR "ip6_tunnel init: can't register ip6ip6\n");
		goto undo_ip4ip6;
	}

	return 0;

undo_ip4ip6:
	xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET);
undo_pernet:
	unregister_pernet_device(&ip6_tnl_net_ops);
	return rc;
}

/**
 * ip6_tunnel_cleanup - free resources and unregister protocol
 *
 * Description:
 *   Deregisters both payload handlers, logging any that fail, and then
 *   unregisters the per-namespace operations.
 **/

static void __exit ip6_tunnel_cleanup(void)
{
	int rc;

	rc = xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET);
	if (rc)
		printk(KERN_INFO "ip6_tunnel close: can't deregister ip4ip6\n");

	rc = xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6);
	if (rc)
		printk(KERN_INFO "ip6_tunnel close: can't deregister ip6ip6\n");

	unregister_pernet_device(&ip6_tnl_net_ops);
}

/* Module entry and exit points. */
module_init(ip6_tunnel_init);
module_exit(ip6_tunnel_cleanup);