ip6_tunnel.c 37.2 KB
Newer Older
L
Linus Torvalds 已提交
1
/*
 *	IPv6 tunneling device
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Ville Nuorvala		<vnuorval@tcs.hut.fi>
 *	Yasuyuki Kozakai	<kozakai@linux-ipv6.org>
 *
 *      Based on:
 *      linux/net/ipv6/sit.c and linux/net/ipv4/ipip.c
 *
 *      RFC 2473
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 */

21 22
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

L
Linus Torvalds 已提交
23
#include <linux/module.h>
24
#include <linux/capability.h>
L
Linus Torvalds 已提交
25 26 27
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/sockios.h>
28
#include <linux/icmp.h>
L
Linus Torvalds 已提交
29 30 31 32 33 34 35 36 37 38 39 40 41
#include <linux/if.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/if_tunnel.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/icmpv6.h>
#include <linux/init.h>
#include <linux/route.h>
#include <linux/rtnetlink.h>
#include <linux/netfilter_ipv6.h>
42
#include <linux/slab.h>
L
Linus Torvalds 已提交
43 44

#include <asm/uaccess.h>
A
Arun Sharma 已提交
45
#include <linux/atomic.h>
L
Linus Torvalds 已提交
46

47
#include <net/icmp.h>
L
Linus Torvalds 已提交
48 49 50 51 52 53 54 55
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/ip6_tunnel.h>
#include <net/xfrm.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
56 57
#include <net/net_namespace.h>
#include <net/netns/generic.h>
L
Linus Torvalds 已提交
58 59

MODULE_AUTHOR("Ville Nuorvala");
60
MODULE_DESCRIPTION("IPv6 tunneling device");
L
Linus Torvalds 已提交
61
MODULE_LICENSE("GPL");
S
stephen hemminger 已提交
62
MODULE_ALIAS_NETDEV("ip6tnl0");
L
Linus Torvalds 已提交
63 64

#ifdef IP6_TNL_DEBUG
65
#define IP6_TNL_TRACE(x...) pr_debug("%s:" x "\n", __func__)
L
Linus Torvalds 已提交
66 67 68 69 70
#else
#define IP6_TNL_TRACE(x...) do {;} while(0)
#endif

#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK)
71
#define IPV6_TCLASS_SHIFT 20
L
Linus Torvalds 已提交
72 73 74

#define HASH_SIZE  32

A
Al Viro 已提交
75
/*
 * Fold the four 32-bit words of an IPv6 address into a bucket index in
 * the range [0, HASH_SIZE - 1].  HASH_SIZE must stay a power of two for
 * the final mask to work.
 */
#define HASH(addr) ((__force u32)((addr)->s6_addr32[0] ^ (addr)->s6_addr32[1] ^ \
		     (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \
		    (HASH_SIZE - 1))
L
Linus Torvalds 已提交
78

E
Eric Dumazet 已提交
79
static int ip6_tnl_dev_init(struct net_device *dev);
80
static void ip6_tnl_dev_setup(struct net_device *dev);
L
Linus Torvalds 已提交
81

82
static int ip6_tnl_net_id __read_mostly;
83
struct ip6_tnl_net {
84 85
	/* the IPv6 tunnel fallback device */
	struct net_device *fb_tnl_dev;
86
	/* lists for storing tunnels in use */
E
Eric Dumazet 已提交
87 88 89
	struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE];
	struct ip6_tnl __rcu *tnls_wc[1];
	struct ip6_tnl __rcu **tnls[2];
90 91
};

E
Eric Dumazet 已提交
92 93 94 95 96 97
/*
 * Per-cpu traffic counters.  Only the frequently updated packet and byte
 * counts live here; every other statistic is shared in netdev->stats.
 * ip6_get_stats() sums these over all possible cpus.
 */
struct pcpu_tstats {
	unsigned long	rx_packets;
	unsigned long	rx_bytes;
	unsigned long	tx_packets;
	unsigned long	tx_bytes;
} __attribute__((aligned(4*sizeof(unsigned long))));
E
Eric Dumazet 已提交
99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119

/*
 * Fold the per-cpu packet/byte counters of @dev into dev->stats and
 * return a pointer to dev->stats.  Counters that are not kept per cpu
 * are left untouched.
 */
static struct net_device_stats *ip6_get_stats(struct net_device *dev)
{
	unsigned long rx_pkts = 0, rx_octets = 0;
	unsigned long tx_pkts = 0, tx_octets = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		const struct pcpu_tstats *pc = per_cpu_ptr(dev->tstats, cpu);

		rx_pkts   += pc->rx_packets;
		rx_octets += pc->rx_bytes;
		tx_pkts   += pc->tx_packets;
		tx_octets += pc->tx_bytes;
	}

	dev->stats.rx_packets = rx_pkts;
	dev->stats.rx_bytes   = rx_octets;
	dev->stats.tx_packets = tx_pkts;
	dev->stats.tx_bytes   = tx_octets;

	return &dev->stats;
}

120
/*
E
Eric Dumazet 已提交
121
 * Locking : hash tables are protected by RCU and RTNL
122
 */
L
Linus Torvalds 已提交
123 124 125 126 127

static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t)
{
	struct dst_entry *dst = t->dst_cache;

128
	if (dst && dst->obsolete &&
L
Linus Torvalds 已提交
129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
	    dst->ops->check(dst, t->dst_cookie) == NULL) {
		t->dst_cache = NULL;
		dst_release(dst);
		return NULL;
	}

	return dst;
}

/* Drop the tunnel's reference on its cached route and empty the slot. */
static inline void ip6_tnl_dst_reset(struct ip6_tnl *t)
{
	struct dst_entry *stale = t->dst_cache;

	dst_release(stale);
	t->dst_cache = NULL;
}

/*
 * Install @dst as the tunnel's cached route, releasing whatever was
 * cached before.  The validation cookie is the serial number of the
 * route's fib node, or 0 when the route has no node.  No extra reference
 * is taken here; the caller's reference moves into the cache.
 */
static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt->rt6i_node)
		t->dst_cookie = rt->rt6i_node->fn_sernum;
	else
		t->dst_cookie = 0;

	dst_release(t->dst_cache);
	t->dst_cache = dst;
}

/**
153
 * ip6_tnl_lookup - fetch tunnel matching the end-point addresses
154 155
 *   @remote: the address of the tunnel exit-point
 *   @local: the address of the tunnel entry-point
L
Linus Torvalds 已提交
156
 *
157
 * Return:
L
Linus Torvalds 已提交
158
 *   tunnel matching given end-points if found,
159
 *   else fallback tunnel if its device is up,
L
Linus Torvalds 已提交
160 161 162
 *   else %NULL
 **/

163 164 165
#define for_each_ip6_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))

L
Linus Torvalds 已提交
166
static struct ip6_tnl *
167
ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_addr *local)
L
Linus Torvalds 已提交
168
{
E
Eric Dumazet 已提交
169 170
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(local);
L
Linus Torvalds 已提交
171
	struct ip6_tnl *t;
172
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
L
Linus Torvalds 已提交
173

174
	for_each_ip6_tunnel_rcu(ip6n->tnls_r_l[h0 ^ h1]) {
L
Linus Torvalds 已提交
175 176 177 178 179
		if (ipv6_addr_equal(local, &t->parms.laddr) &&
		    ipv6_addr_equal(remote, &t->parms.raddr) &&
		    (t->dev->flags & IFF_UP))
			return t;
	}
180 181
	t = rcu_dereference(ip6n->tnls_wc[0]);
	if (t && (t->dev->flags & IFF_UP))
L
Linus Torvalds 已提交
182 183 184 185 186 187
		return t;

	return NULL;
}

/**
188
 * ip6_tnl_bucket - get head of list matching given tunnel parameters
189
 *   @p: parameters containing tunnel end-points
L
Linus Torvalds 已提交
190 191
 *
 * Description:
192
 *   ip6_tnl_bucket() returns the head of the list matching the
L
Linus Torvalds 已提交
193 194
 *   &struct in6_addr entries laddr and raddr in @p.
 *
195
 * Return: head of IPv6 tunnel list
L
Linus Torvalds 已提交
196 197
 **/

E
Eric Dumazet 已提交
198
static struct ip6_tnl __rcu **
199
ip6_tnl_bucket(struct ip6_tnl_net *ip6n, const struct ip6_tnl_parm *p)
L
Linus Torvalds 已提交
200
{
201 202
	const struct in6_addr *remote = &p->raddr;
	const struct in6_addr *local = &p->laddr;
203
	unsigned int h = 0;
L
Linus Torvalds 已提交
204 205 206 207 208 209
	int prio = 0;

	if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) {
		prio = 1;
		h = HASH(remote) ^ HASH(local);
	}
210
	return &ip6n->tnls[prio][h];
L
Linus Torvalds 已提交
211 212 213
}

/**
214
 * ip6_tnl_link - add tunnel to hash table
L
Linus Torvalds 已提交
215 216 217 218
 *   @t: tunnel to be added
 **/

static void
219
ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
L
Linus Torvalds 已提交
220
{
E
Eric Dumazet 已提交
221
	struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms);
L
Linus Torvalds 已提交
222

223 224
	rcu_assign_pointer(t->next , rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
L
Linus Torvalds 已提交
225 226 227
}

/**
228
 * ip6_tnl_unlink - remove tunnel from hash table
L
Linus Torvalds 已提交
229 230 231 232
 *   @t: tunnel to be removed
 **/

static void
233
ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
L
Linus Torvalds 已提交
234
{
E
Eric Dumazet 已提交
235 236 237 238 239 240 241
	struct ip6_tnl __rcu **tp;
	struct ip6_tnl *iter;

	for (tp = ip6_tnl_bucket(ip6n, &t->parms);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
242
			rcu_assign_pointer(*tp, t->next);
L
Linus Torvalds 已提交
243 244 245 246 247
			break;
		}
	}
}

E
Eric Dumazet 已提交
248 249 250 251 252 253
/*
 * Release a tunnel net_device: free its per-cpu statistics first, then
 * the device itself.  Also used on the ip6_tnl_create() error path.
 */
static void ip6_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}

L
Linus Torvalds 已提交
254
/**
255
 * ip6_tnl_create - create a new tunnel
L
Linus Torvalds 已提交
256 257 258 259 260
 *   @p: tunnel parameters
 *   @pt: pointer to new tunnel
 *
 * Description:
 *   Create tunnel matching given parameters.
261 262
 *
 * Return:
263
 *   created tunnel or NULL
L
Linus Torvalds 已提交
264 265
 **/

266
static struct ip6_tnl *ip6_tnl_create(struct net *net, struct ip6_tnl_parm *p)
L
Linus Torvalds 已提交
267 268 269 270 271
{
	struct net_device *dev;
	struct ip6_tnl *t;
	char name[IFNAMSIZ];
	int err;
272
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
L
Linus Torvalds 已提交
273

274
	if (p->name[0])
L
Linus Torvalds 已提交
275
		strlcpy(name, p->name, IFNAMSIZ);
276 277 278
	else
		sprintf(name, "ip6tnl%%d");

279
	dev = alloc_netdev(sizeof (*t), name, ip6_tnl_dev_setup);
L
Linus Torvalds 已提交
280
	if (dev == NULL)
281
		goto failed;
L
Linus Torvalds 已提交
282

283 284
	dev_net_set(dev, net);

285
	t = netdev_priv(dev);
L
Linus Torvalds 已提交
286
	t->parms = *p;
E
Eric Dumazet 已提交
287 288 289
	err = ip6_tnl_dev_init(dev);
	if (err < 0)
		goto failed_free;
L
Linus Torvalds 已提交
290

291 292 293
	if ((err = register_netdevice(dev)) < 0)
		goto failed_free;

294 295
	strcpy(t->parms.name, dev->name);

L
Linus Torvalds 已提交
296
	dev_hold(dev);
297
	ip6_tnl_link(ip6n, t);
298
	return t;
299 300

failed_free:
E
Eric Dumazet 已提交
301
	ip6_dev_free(dev);
302 303
failed:
	return NULL;
L
Linus Torvalds 已提交
304 305 306
}

/**
307
 * ip6_tnl_locate - find or create tunnel matching given parameters
308
 *   @p: tunnel parameters
L
Linus Torvalds 已提交
309 310 311
 *   @create: != 0 if allowed to create new tunnel if no match found
 *
 * Description:
312
 *   ip6_tnl_locate() first tries to locate an existing tunnel
L
Linus Torvalds 已提交
313 314 315 316
 *   based on @parms. If this is unsuccessful, but @create is set a new
 *   tunnel device is created and registered for use.
 *
 * Return:
317
 *   matching tunnel or NULL
L
Linus Torvalds 已提交
318 319
 **/

320 321
static struct ip6_tnl *ip6_tnl_locate(struct net *net,
		struct ip6_tnl_parm *p, int create)
L
Linus Torvalds 已提交
322
{
323 324
	const struct in6_addr *remote = &p->raddr;
	const struct in6_addr *local = &p->laddr;
E
Eric Dumazet 已提交
325
	struct ip6_tnl __rcu **tp;
L
Linus Torvalds 已提交
326
	struct ip6_tnl *t;
327
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
L
Linus Torvalds 已提交
328

E
Eric Dumazet 已提交
329 330 331
	for (tp = ip6_tnl_bucket(ip6n, p);
	     (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next) {
L
Linus Torvalds 已提交
332
		if (ipv6_addr_equal(local, &t->parms.laddr) &&
333 334
		    ipv6_addr_equal(remote, &t->parms.raddr))
			return t;
L
Linus Torvalds 已提交
335 336
	}
	if (!create)
337
		return NULL;
338
	return ip6_tnl_create(net, p);
L
Linus Torvalds 已提交
339 340 341
}

/**
342
 * ip6_tnl_dev_uninit - tunnel device uninitializer
L
Linus Torvalds 已提交
343
 *   @dev: the device to be destroyed
344
 *
L
Linus Torvalds 已提交
345
 * Description:
346
 *   ip6_tnl_dev_uninit() removes tunnel from its list
L
Linus Torvalds 已提交
347 348 349
 **/

static void
350
ip6_tnl_dev_uninit(struct net_device *dev)
L
Linus Torvalds 已提交
351
{
352
	struct ip6_tnl *t = netdev_priv(dev);
353 354
	struct net *net = dev_net(dev);
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
L
Linus Torvalds 已提交
355

E
Eric Dumazet 已提交
356
	if (dev == ip6n->fb_tnl_dev)
357
		RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL);
E
Eric Dumazet 已提交
358
	else
359
		ip6_tnl_unlink(ip6n, t);
L
Linus Torvalds 已提交
360 361 362 363 364 365 366 367
	ip6_tnl_dst_reset(t);
	dev_put(dev);
}

/**
 * parse_tlv_tnl_enc_lim - handle encapsulation limit option
 *   @skb: received socket buffer
 *   @raw: start of the IPv6 header to parse; must point into the
 *         linear data of @skb (at or after skb->data)
 *
 * Description:
 *   Walks the extension header chain following the IPv6 header at @raw
 *   and looks inside destination option headers for a tunnel
 *   encapsulation limit TLV.  Parsing stops at a non-first fragment or
 *   when the needed bytes cannot be pulled into the linear area.
 *
 *   pskb_may_pull() may reallocate the skb head, so @raw and any other
 *   pointer into the packet taken before this call must be treated as
 *   stale afterwards.  The returned index is relative to the start of
 *   the IPv6 header and has to be applied to a freshly loaded pointer.
 *
 * Return:
 *   0 if none was found,
 *   else index to encapsulation limit
 **/

static __u16
parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw)
{
	const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) raw;
	/* work with offsets from skb->data; they survive a head realloc */
	unsigned int nhoff = raw - skb->data;
	unsigned int off = nhoff + sizeof (*ipv6h);
	__u8 next, nexthdr = ipv6h->nexthdr;

	while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) {
		struct ipv6_opt_hdr *hdr;
		__u16 optlen;

		if (!pskb_may_pull(skb, off + sizeof (*hdr)))
			break;

		hdr = (struct ipv6_opt_hdr *) (skb->data + off);
		if (nexthdr == NEXTHDR_FRAGMENT) {
			struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr;
			if (frag_hdr->frag_off)
				break;
			optlen = 8;
		} else if (nexthdr == NEXTHDR_AUTH) {
			optlen = (hdr->hdrlen + 2) << 2;
		} else {
			optlen = ipv6_optlen(hdr);
		}
		/* cache it: the pull below may invalidate hdr */
		next = hdr->nexthdr;
		if (nexthdr == NEXTHDR_DEST) {
			__u16 i = 2;

			/* the whole option area must be linear before we walk it */
			if (!pskb_may_pull(skb, off + optlen))
				break;

			while (1) {
				struct ipv6_tlv_tnl_enc_lim *tel;

				/* No more room for encapsulation limit */
				if (i + sizeof (*tel) > optlen)
					break;

				tel = (struct ipv6_tlv_tnl_enc_lim *) (skb->data + off + i);
				/* return index of option if found and valid */
				if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT &&
				    tel->length == 1)
					return i + off - nhoff;
				/* else jump to next option */
				if (tel->type)
					i += tel->length + 2;
				else
					i++;
			}
		}
		nexthdr = next;
		off += optlen;
	}
	return 0;
}

/**
426
 * ip6_tnl_err - tunnel error handler
L
Linus Torvalds 已提交
427 428
 *
 * Description:
429
 *   ip6_tnl_err() should handle errors in the tunnel according
L
Linus Torvalds 已提交
430 431 432
 *   to the specifications in RFC 2473.
 **/

H
Herbert Xu 已提交
433
static int
434
ip6_tnl_err(struct sk_buff *skb, __u8 ipproto, struct inet6_skb_parm *opt,
435
	    u8 *type, u8 *code, int *msg, __u32 *info, int offset)
L
Linus Torvalds 已提交
436
{
437
	const struct ipv6hdr *ipv6h = (const struct ipv6hdr *) skb->data;
L
Linus Torvalds 已提交
438 439
	struct ip6_tnl *t;
	int rel_msg = 0;
440 441
	u8 rel_type = ICMPV6_DEST_UNREACH;
	u8 rel_code = ICMPV6_ADDR_UNREACH;
L
Linus Torvalds 已提交
442 443
	__u32 rel_info = 0;
	__u16 len;
H
Herbert Xu 已提交
444
	int err = -ENOENT;
L
Linus Torvalds 已提交
445

446 447
	/* If the packet doesn't contain the original IPv6 header we are
	   in trouble since we might need the source address for further
L
Linus Torvalds 已提交
448 449
	   processing of the error. */

450
	rcu_read_lock();
451
	if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->daddr,
452
					&ipv6h->saddr)) == NULL)
L
Linus Torvalds 已提交
453 454
		goto out;

455 456 457
	if (t->parms.proto != ipproto && t->parms.proto != 0)
		goto out;

H
Herbert Xu 已提交
458 459
	err = 0;

460
	switch (*type) {
L
Linus Torvalds 已提交
461 462 463 464
		__u32 teli;
		struct ipv6_tlv_tnl_enc_lim *tel;
		__u32 mtu;
	case ICMPV6_DEST_UNREACH:
465 466
		net_warn_ratelimited("%s: Path to destination invalid or inactive!\n",
				     t->parms.name);
L
Linus Torvalds 已提交
467 468 469
		rel_msg = 1;
		break;
	case ICMPV6_TIME_EXCEED:
470
		if ((*code) == ICMPV6_EXC_HOPLIMIT) {
471 472
			net_warn_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n",
					     t->parms.name);
L
Linus Torvalds 已提交
473 474 475 476
			rel_msg = 1;
		}
		break;
	case ICMPV6_PARAMPROB:
477
		teli = 0;
478
		if ((*code) == ICMPV6_HDR_FIELD)
479
			teli = parse_tlv_tnl_enc_lim(skb, skb->data);
L
Linus Torvalds 已提交
480

A
Al Viro 已提交
481
		if (teli && teli == *info - 2) {
L
Linus Torvalds 已提交
482 483
			tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli];
			if (tel->encap_limit == 0) {
484 485
				net_warn_ratelimited("%s: Too small encapsulation limit or routing loop in tunnel!\n",
						     t->parms.name);
L
Linus Torvalds 已提交
486 487
				rel_msg = 1;
			}
488 489 490
		} else {
			net_warn_ratelimited("%s: Recipient unable to parse tunneled packet!\n",
					     t->parms.name);
L
Linus Torvalds 已提交
491 492 493
		}
		break;
	case ICMPV6_PKT_TOOBIG:
A
Al Viro 已提交
494
		mtu = *info - offset;
L
Linus Torvalds 已提交
495 496 497 498
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
		t->dev->mtu = mtu;

A
Al Viro 已提交
499
		if ((len = sizeof (*ipv6h) + ntohs(ipv6h->payload_len)) > mtu) {
L
Linus Torvalds 已提交
500 501 502 503 504 505 506
			rel_type = ICMPV6_PKT_TOOBIG;
			rel_code = 0;
			rel_info = mtu;
			rel_msg = 1;
		}
		break;
	}
507 508 509 510 511 512 513

	*type = rel_type;
	*code = rel_code;
	*info = rel_info;
	*msg = rel_msg;

out:
514
	rcu_read_unlock();
515 516 517
	return err;
}

518 519
/*
 * ICMPv6 error handler for IPv4-in-IPv6 tunnels.
 *
 * Lets ip6_tnl_err() decide whether the error must be relayed.  If so,
 * the ICMPv6 type/code is translated to ICMPv4 (address unreachable ->
 * host unreachable, packet too big -> fragmentation needed; anything
 * else is dropped), a clone of @skb is trimmed to the inner IPv4 packet
 * at @offset, a route is attached to the clone and icmp_send() is called
 * on it.
 *
 * Returns the negative error from ip6_tnl_err() if no tunnel matched,
 * otherwise 0.
 */
static int
ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
	   u8 type, u8 code, int offset, __be32 info)
{
	struct flowi4 fl4;
	struct rtable *rt;
	const struct iphdr *eiph;
	struct sk_buff *skb2;
	__u32 rel_info = ntohl(info);
	u8 rel_type = type;
	u8 rel_code = code;
	int rel_msg = 0;
	int err;

	err = ip6_tnl_err(skb, IPPROTO_IPIP, opt, &rel_type, &rel_code,
			  &rel_msg, &rel_info, offset);
	if (err < 0)
		return err;

	if (!rel_msg)
		return 0;

	/* map the ICMPv6 error onto its ICMPv4 counterpart */
	if (rel_type == ICMPV6_DEST_UNREACH &&
	    rel_code == ICMPV6_ADDR_UNREACH)
		rel_code = ICMP_HOST_UNREACH;
	else if (rel_type == ICMPV6_PKT_TOOBIG && rel_code == 0)
		rel_code = ICMP_FRAG_NEEDED;
	else
		return 0;
	rel_type = ICMP_DEST_UNREACH;

	if (!pskb_may_pull(skb, offset + sizeof(struct iphdr)))
		return 0;

	skb2 = skb_clone(skb, GFP_ATOMIC);
	if (skb2 == NULL)
		return 0;

	skb_dst_drop(skb2);

	/* make the clone start at the inner IPv4 header */
	skb_pull(skb2, offset);
	skb_reset_network_header(skb2);
	eiph = ip_hdr(skb2);

	/* Try to guess incoming interface */
	rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL,
				   eiph->saddr, 0,
				   0, 0,
				   IPPROTO_IPIP, RT_TOS(eiph->tos), 0);
	if (IS_ERR(rt))
		goto out;

	skb2->dev = rt->dst.dev;

	/* route "incoming" packet */
	if (rt->rt_flags & RTCF_LOCAL) {
		ip_rt_put(rt);
		rt = ip_route_output_ports(dev_net(skb->dev), &fl4, NULL,
					   eiph->daddr, eiph->saddr,
					   0, 0,
					   IPPROTO_IPIP,
					   RT_TOS(eiph->tos), 0);
		if (IS_ERR(rt))
			goto out;
		if (rt->dst.dev->type != ARPHRD_TUNNEL) {
			ip_rt_put(rt);
			goto out;
		}
		skb_dst_set(skb2, &rt->dst);
	} else {
		ip_rt_put(rt);
		if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos,
				   skb2->dev))
			goto out;
		if (skb_dst(skb2)->dev->type != ARPHRD_TUNNEL)
			goto out;
	}

	/* change mtu on this route */
	if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) {
		struct dst_entry *inner_dst = skb_dst(skb2);

		if (rel_info > dst_mtu(inner_dst))
			goto out;

		inner_dst->ops->update_pmtu(inner_dst, rel_info);
	}

	icmp_send(skb2, rel_type, rel_code, htonl(rel_info));

out:
	kfree_skb(skb2);
	return 0;
}

619 620
/*
 * ICMPv6 error handler for IPv6-in-IPv6 tunnels.
 *
 * Lets ip6_tnl_err() decide whether the error must be relayed.  If so,
 * a clone of @skb is trimmed to the inner IPv6 packet at @offset, its
 * device is set from a route lookup on the inner source address when one
 * is found, and icmpv6_send() is called on the clone.
 *
 * Returns the negative error from ip6_tnl_err() if no tunnel matched,
 * otherwise 0.
 */
static int
ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
	   u8 type, u8 code, int offset, __be32 info)
{
	struct rt6_info *rt;
	struct sk_buff *skb2;
	__u32 rel_info = ntohl(info);
	u8 rel_type = type;
	u8 rel_code = code;
	int rel_msg = 0;
	int err;

	err = ip6_tnl_err(skb, IPPROTO_IPV6, opt, &rel_type, &rel_code,
			  &rel_msg, &rel_info, offset);
	if (err < 0)
		return err;

	if (!rel_msg || !pskb_may_pull(skb, offset + sizeof(struct ipv6hdr)))
		return 0;

	skb2 = skb_clone(skb, GFP_ATOMIC);
	if (skb2 == NULL)
		return 0;

	skb_dst_drop(skb2);

	/* make the clone start at the inner IPv6 header */
	skb_pull(skb2, offset);
	skb_reset_network_header(skb2);

	/* Try to guess incoming interface */
	rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr,
			NULL, 0, 0);
	if (rt && rt->dst.dev)
		skb2->dev = rt->dst.dev;

	icmpv6_send(skb2, rel_type, rel_code, rel_info);

	if (rt)
		dst_release(&rt->dst);

	kfree_skb(skb2);
	return 0;
}

663 664
static void ip4ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
					const struct ipv6hdr *ipv6h,
665 666 667 668 669
					struct sk_buff *skb)
{
	__u8 dsfield = ipv6_get_dsfield(ipv6h) & ~INET_ECN_MASK;

	if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY)
670
		ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, dsfield);
671 672

	if (INET_ECN_is_ce(dsfield))
673
		IP_ECN_set_ce(ip_hdr(skb));
674 675
}

676 677
static void ip6ip6_dscp_ecn_decapsulate(const struct ip6_tnl *t,
					const struct ipv6hdr *ipv6h,
678
					struct sk_buff *skb)
L
Linus Torvalds 已提交
679
{
680
	if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY)
681
		ipv6_copy_dscp(ipv6_get_dsfield(ipv6h), ipv6_hdr(skb));
L
Linus Torvalds 已提交
682

683
	if (INET_ECN_is_ce(ipv6_get_dsfield(ipv6h)))
684
		IP6_ECN_set_ce(ipv6_hdr(skb));
L
Linus Torvalds 已提交
685
}
686

687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709
/*
 * Work out what a tunnel with end-points @laddr / @raddr may do.
 *
 * Returns IP6_TNL_F_CAP_PER_PACKET if either address is unspecified.
 * Otherwise both addresses must be unicast or multicast, neither may be
 * loopback, and link-local addresses are only accepted when the tunnel
 * is bound to a link; if so IP6_TNL_F_CAP_XMIT is granted for a unicast
 * local address and IP6_TNL_F_CAP_RCV for a unicast remote address.
 * Returns 0 when nothing is allowed.
 */
static __u32 ip6_tnl_get_cap(struct ip6_tnl *t,
			     const struct in6_addr *laddr,
			     const struct in6_addr *raddr)
{
	const int usable = IPV6_ADDR_UNICAST | IPV6_ADDR_MULTICAST;
	struct ip6_tnl_parm *p = &t->parms;
	int ltype = ipv6_addr_type(laddr);
	int rtype = ipv6_addr_type(raddr);
	int either = ltype | rtype;
	__u32 caps = 0;

	if (ltype == IPV6_ADDR_ANY || rtype == IPV6_ADDR_ANY)
		return IP6_TNL_F_CAP_PER_PACKET;

	if (!(ltype & usable) || !(rtype & usable))
		return 0;
	if (either & IPV6_ADDR_LOOPBACK)
		return 0;
	if ((either & IPV6_ADDR_LINKLOCAL) && !p->link)
		return 0;

	if (ltype & IPV6_ADDR_UNICAST)
		caps |= IP6_TNL_F_CAP_XMIT;
	if (rtype & IPV6_ADDR_UNICAST)
		caps |= IP6_TNL_F_CAP_RCV;

	return caps;
}

E
Eric Dumazet 已提交
710
/* called with rcu_read_lock() */
711 712 713
static inline int ip6_tnl_rcv_ctl(struct ip6_tnl *t,
				  const struct in6_addr *laddr,
				  const struct in6_addr *raddr)
714 715 716
{
	struct ip6_tnl_parm *p = &t->parms;
	int ret = 0;
717
	struct net *net = dev_net(t->dev);
718

719 720 721
	if ((p->flags & IP6_TNL_F_CAP_RCV) ||
	    ((p->flags & IP6_TNL_F_CAP_PER_PACKET) &&
	     (ip6_tnl_get_cap(t, laddr, raddr) & IP6_TNL_F_CAP_RCV))) {
722
		struct net_device *ldev = NULL;
723 724

		if (p->link)
E
Eric Dumazet 已提交
725
			ldev = dev_get_by_index_rcu(net, p->link);
726

727 728 729
		if ((ipv6_addr_is_multicast(laddr) ||
		     likely(ipv6_chk_addr(net, laddr, ldev, 0))) &&
		    likely(!ipv6_chk_addr(net, raddr, NULL, 0)))
730 731 732 733
			ret = 1;
	}
	return ret;
}
L
Linus Torvalds 已提交
734 735

/**
736
 * ip6_tnl_rcv - decapsulate IPv6 packet and retransmit it locally
L
Linus Torvalds 已提交
737
 *   @skb: received socket buffer
738 739
 *   @protocol: ethernet protocol ID
 *   @dscp_ecn_decapsulate: the function to decapsulate DSCP code and ECN
L
Linus Torvalds 已提交
740 741 742 743
 *
 * Return: 0
 **/

744
static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
745
		       __u8 ipproto,
746 747
		       void (*dscp_ecn_decapsulate)(const struct ip6_tnl *t,
						    const struct ipv6hdr *ipv6h,
748
						    struct sk_buff *skb))
L
Linus Torvalds 已提交
749 750
{
	struct ip6_tnl *t;
751
	const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
L
Linus Torvalds 已提交
752

753
	rcu_read_lock();
L
Linus Torvalds 已提交
754

755
	if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr,
756
					&ipv6h->daddr)) != NULL) {
E
Eric Dumazet 已提交
757 758
		struct pcpu_tstats *tstats;

759
		if (t->parms.proto != ipproto && t->parms.proto != 0) {
760
			rcu_read_unlock();
761 762 763
			goto discard;
		}

L
Linus Torvalds 已提交
764
		if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
765
			rcu_read_unlock();
766
			goto discard;
L
Linus Torvalds 已提交
767 768
		}

769
		if (!ip6_tnl_rcv_ctl(t, &ipv6h->daddr, &ipv6h->saddr)) {
770
			t->dev->stats.rx_dropped++;
771
			rcu_read_unlock();
L
Linus Torvalds 已提交
772 773 774
			goto discard;
		}
		secpath_reset(skb);
775
		skb->mac_header = skb->network_header;
776
		skb_reset_network_header(skb);
777
		skb->protocol = htons(protocol);
L
Linus Torvalds 已提交
778 779
		skb->pkt_type = PACKET_HOST;
		memset(skb->cb, 0, sizeof(struct inet6_skb_parm));
780

E
Eric Dumazet 已提交
781 782 783 784 785
		tstats = this_cpu_ptr(t->dev->tstats);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;

		__skb_tunnel_rx(skb, t->dev);
786

787
		dscp_ecn_decapsulate(t, ipv6h, skb);
E
Eric Dumazet 已提交
788

789
		netif_rx(skb);
E
Eric Dumazet 已提交
790

791
		rcu_read_unlock();
L
Linus Torvalds 已提交
792 793
		return 0;
	}
794
	rcu_read_unlock();
L
Linus Torvalds 已提交
795
	return 1;
796 797 798 799

discard:
	kfree_skb(skb);
	return 0;
L
Linus Torvalds 已提交
800 801
}

802 803
static int ip4ip6_rcv(struct sk_buff *skb)
{
804 805
	return ip6_tnl_rcv(skb, ETH_P_IP, IPPROTO_IPIP,
			   ip4ip6_dscp_ecn_decapsulate);
806 807
}

808 809
static int ip6ip6_rcv(struct sk_buff *skb)
{
810 811
	return ip6_tnl_rcv(skb, ETH_P_IPV6, IPPROTO_IPV6,
			   ip6ip6_dscp_ecn_decapsulate);
812 813
}

814 815 816 817
/*
 * Transmit options carrying one tunnel encapsulation limit option.
 * init_tel_txopt() fills dst_opt[] and points ops.dst0opt at it, so the
 * option bytes live in the same object as the ipv6_txoptions.
 */
struct ipv6_tel_txoption {
	struct ipv6_txoptions ops;
	/* 8-byte destination options header built by init_tel_txopt() */
	__u8 dst_opt[8];
};
L
Linus Torvalds 已提交
818

819 820 821
/*
 * Build an 8-byte destination options header containing a tunnel
 * encapsulation limit of @encap_limit and hook it into @opt->ops.
 *
 * Layout of dst_opt[] after this call:
 *   [0..1]  left zero (the struct ipv6_opt_hdr part)
 *   [2..4]  encapsulation limit TLV: type, length 1, limit value
 *   [5..7]  PadN TLV: type, length 1, one zero byte
 */
static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit)
{
	__u8 *tlv = opt->dst_opt;

	memset(opt, 0, sizeof(*opt));

	tlv[2] = IPV6_TLV_TNL_ENCAP_LIMIT;
	tlv[3] = 1;
	tlv[4] = encap_limit;
	tlv[5] = IPV6_TLV_PADN;
	tlv[6] = 1;

	opt->ops.dst0opt = (struct ipv6_opt_hdr *) tlv;
	opt->ops.opt_nflen = 8;
}

/**
834
 * ip6_tnl_addr_conflict - compare packet addresses to tunnel's own
L
Linus Torvalds 已提交
835
 *   @t: the outgoing tunnel device
836
 *   @hdr: IPv6 header from the incoming packet
L
Linus Torvalds 已提交
837 838
 *
 * Description:
839
 *   Avoid trivial tunneling loop by checking that tunnel exit-point
L
Linus Torvalds 已提交
840 841
 *   doesn't match source of incoming packet.
 *
842
 * Return:
L
Linus Torvalds 已提交
843 844 845 846
 *   1 if conflict,
 *   0 else
 **/

E
Eric Dumazet 已提交
847
static inline bool
848
ip6_tnl_addr_conflict(const struct ip6_tnl *t, const struct ipv6hdr *hdr)
L
Linus Torvalds 已提交
849 850 851 852
{
	return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr);
}

853 854 855 856
static inline int ip6_tnl_xmit_ctl(struct ip6_tnl *t)
{
	struct ip6_tnl_parm *p = &t->parms;
	int ret = 0;
857
	struct net *net = dev_net(t->dev);
858

859
	if (p->flags & IP6_TNL_F_CAP_XMIT) {
860 861
		struct net_device *ldev = NULL;

E
Eric Dumazet 已提交
862
		rcu_read_lock();
863
		if (p->link)
E
Eric Dumazet 已提交
864
			ldev = dev_get_by_index_rcu(net, p->link);
865

866
		if (unlikely(!ipv6_chk_addr(net, &p->laddr, ldev, 0)))
867 868
			pr_warn("%s xmit: Local address not yet configured!\n",
				p->name);
869
		else if (!ipv6_addr_is_multicast(&p->raddr) &&
870
			 unlikely(ipv6_chk_addr(net, &p->raddr, NULL, 0)))
871 872
			pr_warn("%s xmit: Routing loop! Remote address found on this node!\n",
				p->name);
873 874
		else
			ret = 1;
E
Eric Dumazet 已提交
875
		rcu_read_unlock();
876 877 878
	}
	return ret;
}
L
Linus Torvalds 已提交
879
/**
880
 * ip6_tnl_xmit2 - encapsulate packet and send
L
Linus Torvalds 已提交
881
 *   @skb: the outgoing socket buffer
882
 *   @dev: the outgoing tunnel device
883 884 885 886
 *   @dsfield: dscp code for outer header
 *   @fl: flow of tunneled packet
 *   @encap_limit: encapsulation limit
 *   @pmtu: Path MTU is stored if packet is too big
L
Linus Torvalds 已提交
887 888 889 890 891
 *
 * Description:
 *   Build new header and do some sanity checks on the packet before sending
 *   it.
 *
892
 * Return:
893
 *   0 on success
894 895
 *   -1 fail
 *   %-EMSGSIZE message too big. return mtu in this case.
L
Linus Torvalds 已提交
896 897
 **/

898 899 900
static int ip6_tnl_xmit2(struct sk_buff *skb,
			 struct net_device *dev,
			 __u8 dsfield,
901
			 struct flowi6 *fl6,
902 903
			 int encap_limit,
			 __u32 *pmtu)
L
Linus Torvalds 已提交
904
{
A
Alexey Dobriyan 已提交
905
	struct net *net = dev_net(dev);
906
	struct ip6_tnl *t = netdev_priv(dev);
907
	struct net_device_stats *stats = &t->dev->stats;
908
	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
909
	struct ipv6_tel_txoption opt;
910
	struct dst_entry *dst = NULL, *ndst = NULL;
L
Linus Torvalds 已提交
911 912
	struct net_device *tdev;
	int mtu;
913
	unsigned int max_headroom = sizeof(struct ipv6hdr);
L
Linus Torvalds 已提交
914
	u8 proto;
915
	int err = -1;
L
Linus Torvalds 已提交
916 917
	int pkt_len;

918 919
	if (!fl6->flowi6_mark)
		dst = ip6_tnl_dst_check(t);
920 921
	if (!dst) {
		ndst = ip6_route_output(net, NULL, fl6);
L
Linus Torvalds 已提交
922

923
		if (ndst->error)
924
			goto tx_err_link_failure;
925 926 927 928
		ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0);
		if (IS_ERR(ndst)) {
			err = PTR_ERR(ndst);
			ndst = NULL;
929 930
			goto tx_err_link_failure;
		}
931
		dst = ndst;
932
	}
L
Linus Torvalds 已提交
933 934 935 936 937

	tdev = dst->dev;

	if (tdev == dev) {
		stats->collisions++;
938 939
		net_warn_ratelimited("%s: Local routing loop detected!\n",
				     t->parms.name);
L
Linus Torvalds 已提交
940 941 942
		goto tx_err_dst_release;
	}
	mtu = dst_mtu(dst) - sizeof (*ipv6h);
943
	if (encap_limit >= 0) {
L
Linus Torvalds 已提交
944 945 946 947 948
		max_headroom += 8;
		mtu -= 8;
	}
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;
E
Eric Dumazet 已提交
949 950
	if (skb_dst(skb))
		skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
L
Linus Torvalds 已提交
951
	if (skb->len > mtu) {
952 953
		*pmtu = mtu;
		err = -EMSGSIZE;
L
Linus Torvalds 已提交
954 955 956 957 958 959 960
		goto tx_err_dst_release;
	}

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
	max_headroom += LL_RESERVED_SPACE(tdev);
961

962 963
	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
L
Linus Torvalds 已提交
964
		struct sk_buff *new_skb;
965

L
Linus Torvalds 已提交
966 967 968 969 970
		if (!(new_skb = skb_realloc_headroom(skb, max_headroom)))
			goto tx_err_dst_release;

		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
971
		consume_skb(skb);
L
Linus Torvalds 已提交
972 973
		skb = new_skb;
	}
E
Eric Dumazet 已提交
974
	skb_dst_drop(skb);
975 976 977 978 979 980
	if (fl6->flowi6_mark) {
		skb_dst_set(skb, dst);
		ndst = NULL;
	} else {
		skb_dst_set_noref(skb, dst);
	}
981
	skb->transport_header = skb->network_header;
L
Linus Torvalds 已提交
982

983
	proto = fl6->flowi6_proto;
984 985 986 987
	if (encap_limit >= 0) {
		init_tel_txopt(&opt, encap_limit);
		ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL);
	}
988 989
	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
990
	ipv6h = ipv6_hdr(skb);
991
	*(__be32*)ipv6h = fl6->flowlabel | htonl(0x60000000);
L
Linus Torvalds 已提交
992 993 994 995
	dsfield = INET_ECN_encapsulate(0, dsfield);
	ipv6_change_dsfield(ipv6h, ~INET_ECN_MASK, dsfield);
	ipv6h->hop_limit = t->parms.hop_limit;
	ipv6h->nexthdr = proto;
A
Alexey Dobriyan 已提交
996 997
	ipv6h->saddr = fl6->saddr;
	ipv6h->daddr = fl6->daddr;
L
Linus Torvalds 已提交
998 999
	nf_reset(skb);
	pkt_len = skb->len;
H
Herbert Xu 已提交
1000
	err = ip6_local_out(skb);
L
Linus Torvalds 已提交
1001

1002
	if (net_xmit_eval(err) == 0) {
E
Eric Dumazet 已提交
1003 1004 1005 1006
		struct pcpu_tstats *tstats = this_cpu_ptr(t->dev->tstats);

		tstats->tx_bytes += pkt_len;
		tstats->tx_packets++;
L
Linus Torvalds 已提交
1007 1008 1009 1010
	} else {
		stats->tx_errors++;
		stats->tx_aborted_errors++;
	}
1011 1012
	if (ndst)
		ip6_tnl_dst_store(t, ndst);
L
Linus Torvalds 已提交
1013 1014 1015 1016 1017
	return 0;
tx_err_link_failure:
	stats->tx_carrier_errors++;
	dst_link_failure(skb);
tx_err_dst_release:
1018
	dst_release(ndst);
1019 1020 1021
	return err;
}

1022 1023 1024 1025
static inline int
ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip6_tnl *t = netdev_priv(dev);
1026
	const struct iphdr  *iph = ip_hdr(skb);
1027
	int encap_limit = -1;
1028
	struct flowi6 fl6;
1029 1030 1031 1032
	__u8 dsfield;
	__u32 mtu;
	int err;

1033 1034
	if ((t->parms.proto != IPPROTO_IPIP && t->parms.proto != 0) ||
	    !ip6_tnl_xmit_ctl(t))
1035 1036 1037 1038 1039
		return -1;

	if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
		encap_limit = t->parms.encap_limit;

1040 1041
	memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6));
	fl6.flowi6_proto = IPPROTO_IPIP;
1042 1043 1044

	dsfield = ipv4_get_dsfield(iph);

1045
	if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
1046
		fl6.flowlabel |= htonl((__u32)iph->tos << IPV6_TCLASS_SHIFT)
A
Al Viro 已提交
1047
					  & IPV6_TCLASS_MASK;
1048 1049
	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
		fl6.flowi6_mark = skb->mark;
1050

1051
	err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062
	if (err != 0) {
		/* XXX: send ICMP error even if DF is not set. */
		if (err == -EMSGSIZE)
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
				  htonl(mtu));
		return -1;
	}

	return 0;
}

1063 1064 1065 1066
static inline int
ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip6_tnl *t = netdev_priv(dev);
1067
	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
1068 1069
	int encap_limit = -1;
	__u16 offset;
1070
	struct flowi6 fl6;
1071 1072 1073 1074
	__u8 dsfield;
	__u32 mtu;
	int err;

1075 1076
	if ((t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) ||
	    !ip6_tnl_xmit_ctl(t) || ip6_tnl_addr_conflict(t, ipv6h))
1077 1078
		return -1;

1079 1080
	offset = parse_tlv_tnl_enc_lim(skb, skb_network_header(skb));
	if (offset > 0) {
1081
		struct ipv6_tlv_tnl_enc_lim *tel;
1082
		tel = (struct ipv6_tlv_tnl_enc_lim *)&skb_network_header(skb)[offset];
1083 1084
		if (tel->encap_limit == 0) {
			icmpv6_send(skb, ICMPV6_PARAMPROB,
1085
				    ICMPV6_HDR_FIELD, offset + 2);
1086 1087 1088 1089 1090 1091
			return -1;
		}
		encap_limit = tel->encap_limit - 1;
	} else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
		encap_limit = t->parms.encap_limit;

1092 1093
	memcpy(&fl6, &t->fl.u.ip6, sizeof (fl6));
	fl6.flowi6_proto = IPPROTO_IPV6;
1094 1095

	dsfield = ipv6_get_dsfield(ipv6h);
1096
	if (t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)
1097
		fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_TCLASS_MASK);
1098
	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)
1099
		fl6.flowlabel |= (*(__be32 *) ipv6h & IPV6_FLOWLABEL_MASK);
1100 1101
	if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK)
		fl6.flowi6_mark = skb->mark;
1102

1103
	err = ip6_tnl_xmit2(skb, dev, dsfield, &fl6, encap_limit, &mtu);
1104 1105
	if (err != 0) {
		if (err == -EMSGSIZE)
1106
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1107 1108 1109 1110 1111 1112
		return -1;
	}

	return 0;
}

1113
static netdev_tx_t
1114 1115 1116
ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip6_tnl *t = netdev_priv(dev);
1117
	struct net_device_stats *stats = &t->dev->stats;
1118 1119 1120
	int ret;

	switch (skb->protocol) {
1121
	case htons(ETH_P_IP):
1122 1123
		ret = ip4ip6_tnl_xmit(skb, dev);
		break;
1124
	case htons(ETH_P_IPV6):
1125 1126 1127 1128 1129 1130 1131 1132 1133
		ret = ip6ip6_tnl_xmit(skb, dev);
		break;
	default:
		goto tx_err;
	}

	if (ret < 0)
		goto tx_err;

1134
	return NETDEV_TX_OK;
1135

L
Linus Torvalds 已提交
1136 1137 1138 1139
tx_err:
	stats->tx_errors++;
	stats->tx_dropped++;
	kfree_skb(skb);
1140
	return NETDEV_TX_OK;
L
Linus Torvalds 已提交
1141 1142
}

1143
static void ip6_tnl_link_config(struct ip6_tnl *t)
L
Linus Torvalds 已提交
1144 1145 1146
{
	struct net_device *dev = t->dev;
	struct ip6_tnl_parm *p = &t->parms;
1147
	struct flowi6 *fl6 = &t->fl.u.ip6;
L
Linus Torvalds 已提交
1148

1149 1150
	memcpy(dev->dev_addr, &p->laddr, sizeof(struct in6_addr));
	memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr));
L
Linus Torvalds 已提交
1151 1152

	/* Set up flowi template */
A
Alexey Dobriyan 已提交
1153 1154
	fl6->saddr = p->laddr;
	fl6->daddr = p->raddr;
1155 1156
	fl6->flowi6_oif = p->link;
	fl6->flowlabel = 0;
L
Linus Torvalds 已提交
1157 1158

	if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS))
1159
		fl6->flowlabel |= IPV6_TCLASS_MASK & p->flowinfo;
L
Linus Torvalds 已提交
1160
	if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL))
1161
		fl6->flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo;
L
Linus Torvalds 已提交
1162

1163 1164
	p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV|IP6_TNL_F_CAP_PER_PACKET);
	p->flags |= ip6_tnl_get_cap(t, &p->laddr, &p->raddr);
L
Linus Torvalds 已提交
1165 1166 1167 1168 1169 1170 1171 1172 1173

	if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV)
		dev->flags |= IFF_POINTOPOINT;
	else
		dev->flags &= ~IFF_POINTOPOINT;

	dev->iflink = p->link;

	if (p->flags & IP6_TNL_F_CAP_XMIT) {
1174 1175 1176
		int strict = (ipv6_addr_type(&p->raddr) &
			      (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL));

1177 1178
		struct rt6_info *rt = rt6_lookup(dev_net(dev),
						 &p->raddr, &p->laddr,
1179
						 p->link, strict);
L
Linus Torvalds 已提交
1180 1181 1182 1183

		if (rt == NULL)
			return;

1184 1185
		if (rt->dst.dev) {
			dev->hard_header_len = rt->dst.dev->hard_header_len +
L
Linus Torvalds 已提交
1186 1187
				sizeof (struct ipv6hdr);

1188
			dev->mtu = rt->dst.dev->mtu - sizeof (struct ipv6hdr);
1189 1190
			if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
				dev->mtu-=8;
L
Linus Torvalds 已提交
1191 1192 1193 1194

			if (dev->mtu < IPV6_MIN_MTU)
				dev->mtu = IPV6_MIN_MTU;
		}
1195
		dst_release(&rt->dst);
L
Linus Torvalds 已提交
1196 1197 1198 1199
	}
}

/**
1200
 * ip6_tnl_change - update the tunnel parameters
L
Linus Torvalds 已提交
1201 1202 1203 1204
 *   @t: tunnel to be changed
 *   @p: tunnel configuration parameters
 *
 * Description:
1205
 *   ip6_tnl_change() updates the tunnel parameters
L
Linus Torvalds 已提交
1206 1207 1208
 **/

static int
1209
ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p)
L
Linus Torvalds 已提交
1210
{
A
Alexey Dobriyan 已提交
1211 1212
	t->parms.laddr = p->laddr;
	t->parms.raddr = p->raddr;
L
Linus Torvalds 已提交
1213 1214 1215 1216
	t->parms.flags = p->flags;
	t->parms.hop_limit = p->hop_limit;
	t->parms.encap_limit = p->encap_limit;
	t->parms.flowinfo = p->flowinfo;
1217
	t->parms.link = p->link;
1218
	t->parms.proto = p->proto;
1219
	ip6_tnl_dst_reset(t);
1220
	ip6_tnl_link_config(t);
L
Linus Torvalds 已提交
1221 1222 1223 1224
	return 0;
}

/**
1225
 * ip6_tnl_ioctl - configure ipv6 tunnels from userspace
L
Linus Torvalds 已提交
1226 1227 1228 1229 1230
 *   @dev: virtual device associated with tunnel
 *   @ifr: parameters passed from userspace
 *   @cmd: command to be performed
 *
 * Description:
1231
 *   ip6_tnl_ioctl() is used for managing IPv6 tunnels
1232
 *   from userspace.
L
Linus Torvalds 已提交
1233 1234 1235 1236 1237 1238 1239
 *
 *   The possible commands are the following:
 *     %SIOCGETTUNNEL: get tunnel parameters for device
 *     %SIOCADDTUNNEL: add tunnel matching given tunnel parameters
 *     %SIOCCHGTUNNEL: change tunnel parameters to those given
 *     %SIOCDELTUNNEL: delete tunnel
 *
1240
 *   The fallback device "ip6tnl0", created during module
L
Linus Torvalds 已提交
1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252
 *   initialization, can be used for creating other tunnel devices.
 *
 * Return:
 *   0 on success,
 *   %-EFAULT if unable to copy data to or from userspace,
 *   %-EPERM if current process hasn't %CAP_NET_ADMIN set
 *   %-EINVAL if passed tunnel parameters are invalid,
 *   %-EEXIST if changing a tunnel's parameters would cause a conflict
 *   %-ENODEV if attempting to change or delete a nonexisting device
 **/

static int
1253
ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
L
Linus Torvalds 已提交
1254 1255 1256 1257
{
	int err = 0;
	struct ip6_tnl_parm p;
	struct ip6_tnl *t = NULL;
1258 1259
	struct net *net = dev_net(dev);
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
L
Linus Torvalds 已提交
1260 1261 1262

	switch (cmd) {
	case SIOCGETTUNNEL:
1263
		if (dev == ip6n->fb_tnl_dev) {
1264
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) {
L
Linus Torvalds 已提交
1265 1266 1267
				err = -EFAULT;
				break;
			}
1268
			t = ip6_tnl_locate(net, &p, 0);
1269 1270
		}
		if (t == NULL)
1271
			t = netdev_priv(dev);
L
Linus Torvalds 已提交
1272 1273 1274 1275 1276 1277 1278 1279 1280 1281
		memcpy(&p, &t->parms, sizeof (p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof (p))) {
			err = -EFAULT;
		}
		break;
	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			break;
1282 1283
		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p)))
L
Linus Torvalds 已提交
1284
			break;
1285
		err = -EINVAL;
1286 1287
		if (p.proto != IPPROTO_IPV6 && p.proto != IPPROTO_IPIP &&
		    p.proto != 0)
L
Linus Torvalds 已提交
1288
			break;
1289
		t = ip6_tnl_locate(net, &p, cmd == SIOCADDTUNNEL);
1290
		if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) {
1291 1292 1293 1294 1295 1296 1297 1298
			if (t != NULL) {
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else
				t = netdev_priv(dev);

1299
			ip6_tnl_unlink(ip6n, t);
1300
			synchronize_net();
1301
			err = ip6_tnl_change(t, &p);
1302
			ip6_tnl_link(ip6n, t);
L
Linus Torvalds 已提交
1303 1304
			netdev_state_change(dev);
		}
1305
		if (t) {
L
Linus Torvalds 已提交
1306
			err = 0;
1307 1308 1309 1310 1311
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof (p)))
				err = -EFAULT;

		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
L
Linus Torvalds 已提交
1312 1313 1314 1315 1316 1317
		break;
	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			break;

1318
		if (dev == ip6n->fb_tnl_dev) {
1319 1320
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p)))
L
Linus Torvalds 已提交
1321
				break;
1322
			err = -ENOENT;
1323
			if ((t = ip6_tnl_locate(net, &p, 0)) == NULL)
L
Linus Torvalds 已提交
1324
				break;
1325
			err = -EPERM;
1326
			if (t->dev == ip6n->fb_tnl_dev)
L
Linus Torvalds 已提交
1327
				break;
1328
			dev = t->dev;
L
Linus Torvalds 已提交
1329
		}
1330 1331
		err = 0;
		unregister_netdevice(dev);
L
Linus Torvalds 已提交
1332 1333 1334 1335 1336 1337 1338 1339
		break;
	default:
		err = -EINVAL;
	}
	return err;
}

/**
1340
 * ip6_tnl_change_mtu - change mtu manually for tunnel device
L
Linus Torvalds 已提交
1341 1342 1343 1344 1345 1346 1347 1348 1349
 *   @dev: virtual device associated with tunnel
 *   @new_mtu: the new mtu
 *
 * Return:
 *   0 on success,
 *   %-EINVAL if mtu too small
 **/

static int
1350
ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
L
Linus Torvalds 已提交
1351 1352 1353 1354 1355 1356 1357 1358
{
	if (new_mtu < IPV6_MIN_MTU) {
		return -EINVAL;
	}
	dev->mtu = new_mtu;
	return 0;
}

1359 1360

static const struct net_device_ops ip6_tnl_netdev_ops = {
E
Eric Dumazet 已提交
1361
	.ndo_uninit	= ip6_tnl_dev_uninit,
1362
	.ndo_start_xmit = ip6_tnl_xmit,
E
Eric Dumazet 已提交
1363
	.ndo_do_ioctl	= ip6_tnl_ioctl,
1364
	.ndo_change_mtu = ip6_tnl_change_mtu,
E
Eric Dumazet 已提交
1365
	.ndo_get_stats	= ip6_get_stats,
1366 1367
};

E
Eric Dumazet 已提交
1368

L
Linus Torvalds 已提交
1369
/**
1370
 * ip6_tnl_dev_setup - setup virtual tunnel device
L
Linus Torvalds 已提交
1371 1372 1373 1374 1375 1376
 *   @dev: virtual device associated with tunnel
 *
 * Description:
 *   Initialize function pointers and device parameters
 **/

1377
static void ip6_tnl_dev_setup(struct net_device *dev)
L
Linus Torvalds 已提交
1378
{
1379 1380
	struct ip6_tnl *t;

1381
	dev->netdev_ops = &ip6_tnl_netdev_ops;
E
Eric Dumazet 已提交
1382
	dev->destructor = ip6_dev_free;
L
Linus Torvalds 已提交
1383 1384 1385 1386

	dev->type = ARPHRD_TUNNEL6;
	dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr);
	dev->mtu = ETH_DATA_LEN - sizeof (struct ipv6hdr);
1387 1388 1389
	t = netdev_priv(dev);
	if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
		dev->mtu-=8;
L
Linus Torvalds 已提交
1390 1391
	dev->flags |= IFF_NOARP;
	dev->addr_len = sizeof(struct in6_addr);
1392
	dev->features |= NETIF_F_NETNS_LOCAL;
1393
	dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
L
Linus Torvalds 已提交
1394 1395 1396 1397
}


/**
1398
 * ip6_tnl_dev_init_gen - general initializer for all tunnel devices
L
Linus Torvalds 已提交
1399 1400 1401
 *   @dev: virtual device associated with tunnel
 **/

E
Eric Dumazet 已提交
1402
static inline int
1403
ip6_tnl_dev_init_gen(struct net_device *dev)
L
Linus Torvalds 已提交
1404
{
1405
	struct ip6_tnl *t = netdev_priv(dev);
E
Eric Dumazet 已提交
1406

L
Linus Torvalds 已提交
1407
	t->dev = dev;
E
Eric Dumazet 已提交
1408 1409 1410 1411
	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;
	return 0;
L
Linus Torvalds 已提交
1412 1413 1414
}

/**
 * ip6_tnl_dev_init - initializer for all non fallback tunnel devices
 *   @dev: virtual device associated with tunnel
 *
 * Return:
 *   0 on success, or the error returned by ip6_tnl_dev_init_gen()
 **/

static int ip6_tnl_dev_init(struct net_device *dev)
{
	struct ip6_tnl *tnl = netdev_priv(dev);
	int err;

	err = ip6_tnl_dev_init_gen(dev);
	if (err != 0)
		return err;

	ip6_tnl_link_config(tnl);

	return 0;
}

/**
1431
 * ip6_fb_tnl_dev_init - initializer for fallback tunnel device
L
Linus Torvalds 已提交
1432 1433 1434 1435 1436
 *   @dev: fallback device
 *
 * Return: 0
 **/

E
Eric Dumazet 已提交
1437
static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev)
L
Linus Torvalds 已提交
1438
{
1439
	struct ip6_tnl *t = netdev_priv(dev);
1440 1441
	struct net *net = dev_net(dev);
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
E
Eric Dumazet 已提交
1442 1443 1444 1445
	int err = ip6_tnl_dev_init_gen(dev);

	if (err)
		return err;
1446

1447
	t->parms.proto = IPPROTO_IPV6;
L
Linus Torvalds 已提交
1448
	dev_hold(dev);
1449 1450 1451

	ip6_tnl_link_config(t);

1452
	rcu_assign_pointer(ip6n->tnls_wc[0], t);
E
Eric Dumazet 已提交
1453
	return 0;
L
Linus Torvalds 已提交
1454 1455
}

1456
/* Receive and error callbacks for IPv4-in-IPv6 traffic. */
static struct xfrm6_tunnel ip4ip6_handler __read_mostly = {
	.handler     = ip4ip6_rcv,
	.err_handler = ip4ip6_err,
	.priority    = 1,
};

1462
/* Receive and error callbacks for IPv6-in-IPv6 traffic. */
static struct xfrm6_tunnel ip6ip6_handler __read_mostly = {
	.handler     = ip6ip6_rcv,
	.err_handler = ip6ip6_err,
	.priority    = 1,
};

1468
/*
 * Queue every tunnel device in the tnls_r_l hash table, then the one in
 * tnls_wc[0], and unregister them all in a single batch.  The pointers
 * are read with rtnl_dereference().
 */
static void __net_exit ip6_tnl_destroy_tunnels(struct ip6_tnl_net *ip6n)
{
	LIST_HEAD(list);
	struct ip6_tnl *t;
	int h;

	for (h = 0; h < HASH_SIZE; h++) {
		for (t = rtnl_dereference(ip6n->tnls_r_l[h]);
		     t != NULL;
		     t = rtnl_dereference(t->next))
			unregister_netdevice_queue(t->dev, &list);
	}

	t = rtnl_dereference(ip6n->tnls_wc[0]);
	unregister_netdevice_queue(t->dev, &list);

	unregister_netdevice_many(&list);
}

1487
/*
 * Per-namespace setup: wire up the tunnel hash tables and create,
 * initialise and register the fallback device "ip6tnl0".
 *
 * Returns 0 on success, -ENOMEM when the device cannot be allocated, or
 * the error from ip6_fb_tnl_dev_init() / register_netdev().
 */
static int __net_init ip6_tnl_init_net(struct net *net)
{
	struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id);
	struct net_device *fb_dev;
	struct ip6_tnl *t = NULL;
	int err;

	ip6n->tnls[0] = ip6n->tnls_wc;
	ip6n->tnls[1] = ip6n->tnls_r_l;

	fb_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0",
			      ip6_tnl_dev_setup);
	ip6n->fb_tnl_dev = fb_dev;
	if (!fb_dev)
		return -ENOMEM;

	dev_net_set(fb_dev, net);

	err = ip6_fb_tnl_dev_init(fb_dev);
	if (err >= 0)
		err = register_netdev(fb_dev);
	if (err < 0) {
		/* Both failure cases release the device the same way. */
		ip6_dev_free(fb_dev);
		return err;
	}

	/* Record the registered name in the tunnel parameters. */
	t = netdev_priv(fb_dev);
	strcpy(t->parms.name, fb_dev->name);

	return 0;
}

1523
/* Per-namespace teardown: destroy all tunnels while holding the RTNL lock. */
static void __net_exit ip6_tnl_exit_net(struct net *net)
{
	struct ip6_tnl_net *tn = net_generic(net, ip6_tnl_net_id);

	rtnl_lock();
	ip6_tnl_destroy_tunnels(tn);
	rtnl_unlock();
}
}

static struct pernet_operations ip6_tnl_net_ops = {
	.init = ip6_tnl_init_net,
	.exit = ip6_tnl_exit_net,
1535 1536
	.id   = &ip6_tnl_net_id,
	.size = sizeof(struct ip6_tnl_net),
1537 1538
};

L
Linus Torvalds 已提交
1539 1540 1541 1542 1543 1544 1545 1546 1547 1548
/**
 * ip6_tunnel_init - register protocol and reserve needed resources
 *
 * Description:
 *   Registers the per-namespace operations, then the IPv4-in-IPv6 and
 *   IPv6-in-IPv6 handlers.  A failing step undoes the earlier ones in
 *   reverse order.
 *
 * Return: 0 on success, otherwise the error of the failing registration
 **/

static int __init ip6_tunnel_init(void)
{
	int err;

	err = register_pernet_device(&ip6_tnl_net_ops);
	if (err < 0)
		return err;

	err = xfrm6_tunnel_register(&ip4ip6_handler, AF_INET);
	if (err < 0) {
		pr_err("%s: can't register ip4ip6\n", __func__);
		goto undo_pernet;
	}

	err = xfrm6_tunnel_register(&ip6ip6_handler, AF_INET6);
	if (err < 0) {
		pr_err("%s: can't register ip6ip6\n", __func__);
		goto undo_ip4ip6;
	}

	return 0;

undo_ip4ip6:
	xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET);
undo_pernet:
	unregister_pernet_device(&ip6_tnl_net_ops);
	return err;
}
}

/**
 * ip6_tunnel_cleanup - free resources and unregister protocol
 *
 * Description:
 *   Deregisters both tunnel handlers, logging any failure, and then
 *   removes the per-namespace operations.
 **/

static void __exit ip6_tunnel_cleanup(void)
{
	int rc;

	rc = xfrm6_tunnel_deregister(&ip4ip6_handler, AF_INET);
	if (rc)
		pr_info("%s: can't deregister ip4ip6\n", __func__);

	rc = xfrm6_tunnel_deregister(&ip6ip6_handler, AF_INET6);
	if (rc)
		pr_info("%s: can't deregister ip6ip6\n", __func__);

	unregister_pernet_device(&ip6_tnl_net_ops);
}

/* Register the module's load and unload entry points. */
module_init(ip6_tunnel_init);
module_exit(ip6_tunnel_cleanup);