/*
 *	IPv6 over IPv4 tunnel device - Simple Internet Transition (SIT)
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *	Alexey Kuznetsov	<kuznet@ms2.inr.ac.ru>
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 * Roger Venning <r.venning@telstra.com>:	6to4 support
 * Nate Thompson <nate@thebog.net>:		6to4 support
 * Fred Templin <fred.l.templin@boeing.com>:	isatap support
 */

20 21
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

L
Linus Torvalds 已提交
22
#include <linux/module.h>
23
#include <linux/capability.h>
L
Linus Torvalds 已提交
24 25 26 27 28 29 30 31 32
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/icmp.h>
33
#include <linux/slab.h>
34
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
35 36
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
37
#include <linux/if_ether.h>
L
Linus Torvalds 已提交
38 39 40 41 42 43 44 45 46 47 48 49 50 51

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/transp_v6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/ip.h>
#include <net/udp.h>
#include <net/icmp.h>
52
#include <net/ip_tunnels.h>
L
Linus Torvalds 已提交
53 54 55
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/dsfield.h>
56 57
#include <net/net_namespace.h>
#include <net/netns/generic.h>
L
Linus Torvalds 已提交
58 59 60 61 62 63 64

/*
   This version of net/ipv6/sit.c is a clone of net/ipv4/ip_gre.c

   For comments look at net/ipv4/ip_gre.c --ANK
 */

65
/* Number of buckets in each of the hashed tunnel tables of struct sit_net. */
#define IP6_SIT_HASH_SIZE  16
/*
 * Fold an IPv4 address into a bucket index in the range 0..15.
 * The argument is parenthesized so that an expression argument
 * (e.g. "a ^ b") is cast and shifted as a whole.
 */
#define HASH(addr) (((__force u32)(addr) ^ ((__force u32)(addr) >> 4)) & 0xF)
L
Linus Torvalds 已提交
67

68 69 70 71
/*
 * Module parameter (mode 0644).  When true, the receive path logs a
 * rate-limited message for packets whose ECN decapsulation reports an error.
 */
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

E
Eric Dumazet 已提交
72
static int ipip6_tunnel_init(struct net_device *dev);
L
Linus Torvalds 已提交
73
static void ipip6_tunnel_setup(struct net_device *dev);
E
Eric Dumazet 已提交
74
static void ipip6_dev_free(struct net_device *dev);
75 76
static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst,
		      __be32 *v4dst);
77
static struct rtnl_link_ops sit_link_ops __read_mostly;
L
Linus Torvalds 已提交
78

79
static unsigned int sit_net_id __read_mostly;
80
struct sit_net {
81 82 83
	struct ip_tunnel __rcu *tunnels_r_l[IP6_SIT_HASH_SIZE];
	struct ip_tunnel __rcu *tunnels_r[IP6_SIT_HASH_SIZE];
	struct ip_tunnel __rcu *tunnels_l[IP6_SIT_HASH_SIZE];
E
Eric Dumazet 已提交
84 85
	struct ip_tunnel __rcu *tunnels_wc[1];
	struct ip_tunnel __rcu **tunnels[4];
86

87
	struct net_device *fb_tunnel_dev;
88 89
};

E
Eric Dumazet 已提交
90 91 92
/*
 * Must be invoked with rcu_read_lock
 */
E
Eldad Zack 已提交
93
static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net,
94 95 96
					     struct net_device *dev,
					     __be32 remote, __be32 local,
					     int sifindex)
L
Linus Torvalds 已提交
97
{
E
Eric Dumazet 已提交
98 99
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(local);
L
Linus Torvalds 已提交
100
	struct ip_tunnel *t;
101
	struct sit_net *sitn = net_generic(net, sit_net_id);
102
	int ifindex = dev ? dev->ifindex : 0;
L
Linus Torvalds 已提交
103

A
Amerigo Wang 已提交
104
	for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) {
L
Linus Torvalds 已提交
105
		if (local == t->parms.iph.saddr &&
106
		    remote == t->parms.iph.daddr &&
107 108
		    (!dev || !t->parms.link || ifindex == t->parms.link ||
		     sifindex == t->parms.link) &&
109
		    (t->dev->flags & IFF_UP))
L
Linus Torvalds 已提交
110 111
			return t;
	}
A
Amerigo Wang 已提交
112
	for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) {
113
		if (remote == t->parms.iph.daddr &&
114 115
		    (!dev || !t->parms.link || ifindex == t->parms.link ||
		     sifindex == t->parms.link) &&
116
		    (t->dev->flags & IFF_UP))
L
Linus Torvalds 已提交
117 118
			return t;
	}
A
Amerigo Wang 已提交
119
	for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) {
120
		if (local == t->parms.iph.saddr &&
121 122
		    (!dev || !t->parms.link || ifindex == t->parms.link ||
		     sifindex == t->parms.link) &&
123
		    (t->dev->flags & IFF_UP))
L
Linus Torvalds 已提交
124 125
			return t;
	}
E
Eric Dumazet 已提交
126
	t = rcu_dereference(sitn->tunnels_wc[0]);
127
	if (t && (t->dev->flags & IFF_UP))
L
Linus Torvalds 已提交
128 129 130 131
		return t;
	return NULL;
}

E
Eric Dumazet 已提交
132
/*
 * Return the address of the hash-chain head that tunnels with the
 * endpoints in @parms belong to.  The table is selected by which
 * endpoints are non-zero, the slot by the XOR of their hashes.
 */
static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn,
		struct ip_tunnel_parm *parms)
{
	const __be32 daddr = parms->iph.daddr;
	const __be32 saddr = parms->iph.saddr;
	/* bit 1: remote endpoint set, bit 0: local endpoint set */
	int table = (daddr ? 2 : 0) | (saddr ? 1 : 0);
	unsigned int slot = 0;

	if (daddr)
		slot ^= HASH(daddr);
	if (saddr)
		slot ^= HASH(saddr);

	return &sitn->tunnels[table][slot];
}

E
Eric Dumazet 已提交
151
/* Same as __ipip6_bucket(), keyed on an existing tunnel's parameters. */
static inline struct ip_tunnel __rcu **ipip6_bucket(struct sit_net *sitn,
		struct ip_tunnel *t)
{
	struct ip_tunnel_parm *parms = &t->parms;

	return __ipip6_bucket(sitn, parms);
}

157
static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t)
L
Linus Torvalds 已提交
158
{
E
Eric Dumazet 已提交
159 160 161 162 163 164 165
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipip6_bucket(sitn, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
166
			rcu_assign_pointer(*tp, t->next);
L
Linus Torvalds 已提交
167 168 169 170 171
			break;
		}
	}
}

172
static void ipip6_tunnel_link(struct sit_net *sitn, struct ip_tunnel *t)
L
Linus Torvalds 已提交
173
{
E
Eric Dumazet 已提交
174
	struct ip_tunnel __rcu **tp = ipip6_bucket(sitn, t);
L
Linus Torvalds 已提交
175

176 177
	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
L
Linus Torvalds 已提交
178 179
}

180
/*
 * Initialise the 6rd parameters of @dev's tunnel (no-op unless
 * CONFIG_IPV6_SIT_6RD is set).  A non-fallback device copies them from
 * the fallback tunnel; the fallback device itself, or any device when no
 * fallback exists, gets prefix 2002::/16 with no relay prefix.
 */
static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn)
{
#ifdef CONFIG_IPV6_SIT_6RD
	struct net_device *fb_dev = sitn->fb_tunnel_dev;
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel *fb;

	if (fb_dev && dev != fb_dev) {
		fb = netdev_priv(fb_dev);
		memcpy(&t->ip6rd, &fb->ip6rd, sizeof(t->ip6rd));
		return;
	}

	ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0);
	t->ip6rd.relay_prefix = 0;
	t->ip6rd.prefixlen = 16;
	t->ip6rd.relay_prefixlen = 0;
#endif
}

197 198 199 200 201 202 203
static int ipip6_tunnel_create(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = dev_net(dev);
	struct sit_net *sitn = net_generic(net, sit_net_id);
	int err;

204 205
	memcpy(dev->dev_addr, &t->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &t->parms.iph.daddr, 4);
206

N
Nicolas Dichtel 已提交
207
	if ((__force u16)t->parms.i_flags & SIT_ISATAP)
208 209
		dev->priv_flags |= IFF_ISATAP;

210 211
	dev->rtnl_link_ops = &sit_link_ops;

212 213 214 215
	err = register_netdevice(dev);
	if (err < 0)
		goto out;

216 217
	ipip6_tunnel_clone_6rd(dev, sitn);

218 219 220 221 222 223 224 225 226
	dev_hold(dev);

	ipip6_tunnel_link(sitn, t);
	return 0;

out:
	return err;
}

E
Eric Dumazet 已提交
227
static struct ip_tunnel *ipip6_tunnel_locate(struct net *net,
228
		struct ip_tunnel_parm *parms, int create)
L
Linus Torvalds 已提交
229
{
A
Al Viro 已提交
230 231
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
E
Eric Dumazet 已提交
232 233
	struct ip_tunnel *t, *nt;
	struct ip_tunnel __rcu **tp;
L
Linus Torvalds 已提交
234 235
	struct net_device *dev;
	char name[IFNAMSIZ];
236
	struct sit_net *sitn = net_generic(net, sit_net_id);
L
Linus Torvalds 已提交
237

E
Eric Dumazet 已提交
238 239 240
	for (tp = __ipip6_bucket(sitn, parms);
	    (t = rtnl_dereference(*tp)) != NULL;
	     tp = &t->next) {
241
		if (local == t->parms.iph.saddr &&
242 243
		    remote == t->parms.iph.daddr &&
		    parms->link == t->parms.link) {
244 245 246 247 248
			if (create)
				return NULL;
			else
				return t;
		}
L
Linus Torvalds 已提交
249 250 251 252
	}
	if (!create)
		goto failed;

253 254 255
	if (parms->name[0]) {
		if (!dev_valid_name(parms->name))
			goto failed;
L
Linus Torvalds 已提交
256
		strlcpy(name, parms->name, IFNAMSIZ);
257
	} else {
E
Eric Dumazet 已提交
258
		strcpy(name, "sit%d");
259
	}
260 261
	dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN,
			   ipip6_tunnel_setup);
262
	if (!dev)
L
Linus Torvalds 已提交
263 264
		return NULL;

265 266
	dev_net_set(dev, net);

267
	nt = netdev_priv(dev);
268

L
Linus Torvalds 已提交
269
	nt->parms = *parms;
270
	if (ipip6_tunnel_create(dev) < 0)
271
		goto failed_free;
L
Linus Torvalds 已提交
272 273 274

	return nt;

275
failed_free:
276
	free_netdev(dev);
L
Linus Torvalds 已提交
277 278 279 280
failed:
	return NULL;
}

E
Eric Dumazet 已提交
281 282 283 284 285
/*
 * Walk a PRL list starting at @start.  The cursor is the caller's local
 * variable named 'prl', which must be declared in the enclosing scope.
 * Every step uses rcu_dereference().
 */
#define for_each_prl_rcu(start)			\
	for (prl = rcu_dereference(start);	\
	     prl;				\
	     prl = rcu_dereference(prl->next))

286
static struct ip_tunnel_prl_entry *
287
__ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr)
288
{
E
Eric Dumazet 已提交
289
	struct ip_tunnel_prl_entry *prl;
290

E
Eric Dumazet 已提交
291 292
	for_each_prl_rcu(t->prl)
		if (prl->addr == addr)
293
			break;
E
Eric Dumazet 已提交
294
	return prl;
295 296 297

}

298 299
/*
 * Copy PRL entries of tunnel @t to user space.
 *
 * @a points at a user struct ip_tunnel_prl header; the entries are
 * written directly after it (at a + 1) and the number of bytes written
 * is stored back in a->datalen.  The header's datalen bounds how many
 * entries fit; a non-zero header addr restricts the answer to that one
 * address.
 *
 * Returns 0 on success, -EFAULT if user memory cannot be accessed, or
 * -ENOMEM if the result buffer cannot be allocated.
 */
static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
				struct ip_tunnel_prl __user *a)
{
	struct ip_tunnel_prl kprl, *kp;
	struct ip_tunnel_prl_entry *prl;
	unsigned int cmax, cap, c = 0, ca, len;
	int ret = 0;

	if (copy_from_user(&kprl, a, sizeof(kprl)))
		return -EFAULT;
	cmax = kprl.datalen / sizeof(kprl);
	if (cmax > 1 && kprl.addr != htonl(INADDR_ANY))
		cmax = 1;

	/* For simple GET or for root users,
	 * we try harder to allocate.
	 */
	kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ?
		kcalloc(cmax, sizeof(*kp), GFP_KERNEL | __GFP_NOWARN) :
		NULL;
	/* cap tracks how many entries the kp buffer can actually hold */
	cap = cmax;

	rcu_read_lock();

	ca = t->prl_count < cmax ? t->prl_count : cmax;

	if (!kp) {
		/* We don't try hard to allocate much memory for
		 * non-root users.
		 * For root users, retry allocating enough memory for
		 * the answer.
		 */
		kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC);
		if (!kp) {
			/*
			 * Report the failure; previously the error code was
			 * overwritten with 0 further down.
			 */
			rcu_read_unlock();
			return -ENOMEM;
		}
		cap = ca;
	}

	for_each_prl_rcu(t->prl) {
		/* never write past the buffer that was really allocated */
		if (c >= cap)
			break;
		if (kprl.addr != htonl(INADDR_ANY) && prl->addr != kprl.addr)
			continue;
		kp[c].addr = prl->addr;
		kp[c].flags = prl->flags;
		c++;
		if (kprl.addr != htonl(INADDR_ANY))
			break;
	}

	rcu_read_unlock();

	len = sizeof(*kp) * c;
	if ((len && copy_to_user(a + 1, kp, len)) || put_user(len, &a->datalen))
		ret = -EFAULT;

	kfree(kp);

	return ret;
}

361 362 363 364
static int
ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg)
{
	struct ip_tunnel_prl_entry *p;
365 366
	int err = 0;

367 368 369
	if (a->addr == htonl(INADDR_ANY))
		return -EINVAL;

E
Eric Dumazet 已提交
370
	ASSERT_RTNL();
371

E
Eric Dumazet 已提交
372
	for (p = rtnl_dereference(t->prl); p; p = rtnl_dereference(p->next)) {
373
		if (p->addr == a->addr) {
E
Eric Dumazet 已提交
374 375 376 377
			if (chg) {
				p->flags = a->flags;
				goto out;
			}
378 379
			err = -EEXIST;
			goto out;
380 381 382
		}
	}

383 384 385 386
	if (chg) {
		err = -ENXIO;
		goto out;
	}
387 388

	p = kzalloc(sizeof(struct ip_tunnel_prl_entry), GFP_KERNEL);
389 390 391 392
	if (!p) {
		err = -ENOBUFS;
		goto out;
	}
393 394

	p->next = t->prl;
395 396
	p->addr = a->addr;
	p->flags = a->flags;
E
Eric Dumazet 已提交
397
	t->prl_count++;
398
	rcu_assign_pointer(t->prl, p);
399 400
out:
	return err;
401 402
}

E
Eric Dumazet 已提交
403 404 405 406 407 408
static void prl_list_destroy_rcu(struct rcu_head *head)
{
	struct ip_tunnel_prl_entry *p, *n;

	p = container_of(head, struct ip_tunnel_prl_entry, rcu_head);
	do {
409
		n = rcu_dereference_protected(p->next, 1);
E
Eric Dumazet 已提交
410 411 412 413 414
		kfree(p);
		p = n;
	} while (p);
}

415 416 417
static int
ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a)
{
418 419
	struct ip_tunnel_prl_entry *x;
	struct ip_tunnel_prl_entry __rcu **p;
420 421
	int err = 0;

E
Eric Dumazet 已提交
422
	ASSERT_RTNL();
423

424
	if (a && a->addr != htonl(INADDR_ANY)) {
425 426 427 428
		for (p = &t->prl;
		     (x = rtnl_dereference(*p)) != NULL;
		     p = &x->next) {
			if (x->addr == a->addr) {
429
				*p = x->next;
430
				kfree_rcu(x, rcu_head);
431
				t->prl_count--;
432
				goto out;
433 434
			}
		}
435
		err = -ENXIO;
436
	} else {
437 438
		x = rtnl_dereference(t->prl);
		if (x) {
E
Eric Dumazet 已提交
439 440 441
			t->prl_count = 0;
			call_rcu(&x->rcu_head, prl_list_destroy_rcu);
			t->prl = NULL;
442 443
		}
	}
444
out:
445
	return err;
446 447 448
}

static int
449
isatap_chksrc(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *t)
450
{
451
	struct ip_tunnel_prl_entry *p;
452 453
	int ok = 1;

E
Eric Dumazet 已提交
454
	rcu_read_lock();
455
	p = __ipip6_tunnel_locate_prl(t, iph->saddr);
456
	if (p) {
457
		if (p->flags & PRL_DEFAULT)
458 459 460 461
			skb->ndisc_nodetype = NDISC_NODETYPE_DEFAULT;
		else
			skb->ndisc_nodetype = NDISC_NODETYPE_NODEFAULT;
	} else {
462 463
		const struct in6_addr *addr6 = &ipv6_hdr(skb)->saddr;

464 465
		if (ipv6_addr_is_isatap(addr6) &&
		    (addr6->s6_addr32[3] == iph->saddr) &&
466
		    ipv6_chk_prefix(addr6, t->dev))
467 468 469 470
			skb->ndisc_nodetype = NDISC_NODETYPE_HOST;
		else
			ok = 0;
	}
E
Eric Dumazet 已提交
471
	rcu_read_unlock();
472 473 474
	return ok;
}

L
Linus Torvalds 已提交
475 476
static void ipip6_tunnel_uninit(struct net_device *dev)
{
N
Nicolas Dichtel 已提交
477 478
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct sit_net *sitn = net_generic(tunnel->net, sit_net_id);
479

480
	if (dev == sitn->fb_tunnel_dev) {
481
		RCU_INIT_POINTER(sitn->tunnels_wc[0], NULL);
L
Linus Torvalds 已提交
482
	} else {
N
Nicolas Dichtel 已提交
483 484
		ipip6_tunnel_unlink(sitn, tunnel);
		ipip6_tunnel_del_prl(tunnel, NULL);
L
Linus Torvalds 已提交
485
	}
486
	dst_cache_reset(&tunnel->dst_cache);
E
Eric Dumazet 已提交
487
	dev_put(dev);
L
Linus Torvalds 已提交
488 489
}

490
/*
 * ICMP error handler.  skb->data is read as the IPv4 header quoted in
 * the ICMP message; the tunnel is looked up with that header's daddr as
 * remote and saddr as local.
 *
 * Only DEST_UNREACH (except SR_FAILED), TIME_EXCEEDED with code EXC_TTL
 * and REDIRECT are processed; every other type/code returns 0 at once.
 * Returns -ENOENT when no tunnel matches, 0 otherwise.
 */
static int ipip6_err(struct sk_buff *skb, u32 info)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	unsigned int data_len = 0;
	struct ip_tunnel *t;
	int sifindex = 0;

	if (type == ICMP_DEST_UNREACH) {
		/* Impossible event. */
		if (code == ICMP_SR_FAILED)
			return 0;
		/* All others are translated to HOST_UNREACH.
		   rfc2003 contains "deep thoughts" about NET_UNREACH,
		   I believe they are just ether pollution. --ANK
		 */
	} else if (type == ICMP_TIME_EXCEEDED) {
		if (code != ICMP_EXC_TTL)
			return 0;
		data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */
	} else if (type != ICMP_REDIRECT) {
		/* ICMP_PARAMETERPROB and anything unrecognised */
		return 0;
	}

	if (netif_is_l3_master(skb->dev))
		sifindex = IPCB(skb)->iif;

	t = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
				iph->daddr, iph->saddr, sifindex);
	if (!t)
		return -ENOENT;

	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
		ipv4_update_pmtu(skb, dev_net(skb->dev), info,
				 t->parms.link, 0, iph->protocol, 0);
		return 0;
	}

	if (type == ICMP_REDIRECT) {
		ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
			      iph->protocol, 0);
		return 0;
	}

	/* done if an ICMPv6 unreachable could be generated from it */
	if (__in6_dev_get(skb->dev) &&
	    !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type, data_len))
		return 0;

	if (t->parms.iph.daddr == 0)
		return 0;

	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		return 0;

	/* count errors arriving within IPTUNNEL_ERR_TIMEO of the last one */
	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;

	return 0;
}

568 569 570 571 572 573 574 575 576
/*
 * True when @v6addr embeds an IPv4 address (per check_6rd()) that differs
 * from the outer address @v4addr.
 */
static inline bool is_spoofed_6rd(struct ip_tunnel *tunnel, const __be32 v4addr,
				  const struct in6_addr *v6addr)
{
	__be32 embedded = 0;

	return check_6rd(tunnel, v6addr, &embedded) && v4addr != embedded;
}

577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640
/* Checks whether an address matches an address on the tunnel interface.
 * Used to detect NAT of proto 41 packets and let them pass the spoofing
 * test.
 *
 * Background: this is called after is_spoofed_6rd() has flagged the
 * destination as spoofed.  A router may be doing NAT for proto 41
 * packets on behalf of an internal station: destination
 * a.a.a.a/PREFIX:bbbb:bbbb is translated to n.n.n.n/PREFIX:bbbb:bbbb,
 * so is_spoofed_6rd() returns true and the packet would be dropped.
 * The packet can still be validated against the IPv6 addresses
 * configured on the tunnel interface, which is what happens here.
 */
static bool only_dnatted(const struct ip_tunnel *tunnel,
	const struct in6_addr *v6dst)
{
#ifdef CONFIG_IPV6_SIT_6RD
	int prefix_len = tunnel->ip6rd.prefixlen + 32
			 - tunnel->ip6rd.relay_prefixlen;
#else
	int prefix_len = 48;
#endif

	return ipv6_chk_custom_prefix(v6dst, prefix_len, tunnel->dev);
}

/*
 * Returns true if a packet is spoofed.
 *
 * ISATAP tunnels rely on isatap_chksrc() alone; point-to-point tunnels
 * are never treated as spoofed.  Otherwise both the source and the
 * destination are checked with is_spoofed_6rd(); a destination mismatch
 * is forgiven when only_dnatted() accepts it.
 */
static bool packet_is_spoofed(struct sk_buff *skb,
			      const struct iphdr *iph,
			      struct ip_tunnel *tunnel)
{
	const struct ipv6hdr *ipv6h;

	if (tunnel->dev->priv_flags & IFF_ISATAP)
		return !isatap_chksrc(skb, iph, tunnel);

	if (tunnel->dev->flags & IFF_POINTOPOINT)
		return false;

	ipv6h = ipv6_hdr(skb);

	if (unlikely(is_spoofed_6rd(tunnel, iph->saddr, &ipv6h->saddr))) {
		net_warn_ratelimited("Src spoofed %pI4/%pI6c -> %pI4/%pI6c\n",
				     &iph->saddr, &ipv6h->saddr,
				     &iph->daddr, &ipv6h->daddr);
		return true;
	}

	if (unlikely(is_spoofed_6rd(tunnel, iph->daddr, &ipv6h->daddr)) &&
	    !only_dnatted(tunnel, &ipv6h->daddr)) {
		net_warn_ratelimited("Dst spoofed %pI4/%pI6c -> %pI4/%pI6c\n",
				     &iph->saddr, &ipv6h->saddr,
				     &iph->daddr, &ipv6h->daddr);
		return true;
	}

	return false;
}

L
Linus Torvalds 已提交
641 642
static int ipip6_rcv(struct sk_buff *skb)
{
643
	const struct iphdr *iph = ip_hdr(skb);
L
Linus Torvalds 已提交
644
	struct ip_tunnel *tunnel;
645
	int sifindex;
646
	int err;
L
Linus Torvalds 已提交
647

648
	sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0;
649
	tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
650
				     iph->saddr, iph->daddr, sifindex);
651
	if (tunnel) {
652
		struct pcpu_sw_netstats *tstats;
E
Eric Dumazet 已提交
653

654 655 656 657
		if (tunnel->parms.iph.protocol != IPPROTO_IPV6 &&
		    tunnel->parms.iph.protocol != 0)
			goto out;

658
		skb->mac_header = skb->network_header;
659
		skb_reset_network_header(skb);
660
		IPCB(skb)->flags = 0;
661
		skb->dev = tunnel->dev;
F
Fred L. Templin 已提交
662

663 664 665
		if (packet_is_spoofed(skb, iph, tunnel)) {
			tunnel->dev->stats.rx_errors++;
			goto out;
666 667
		}

668 669 670
		if (iptunnel_pull_header(skb, 0, htons(ETH_P_IPV6),
		    !net_eq(tunnel->net, dev_net(tunnel->dev))))
			goto out;
671

672 673 674 675
		/* skb can be uncloned in iptunnel_pull_header, so
		 * old iph is no longer valid
		 */
		iph = (const struct iphdr *)skb_mac_header(skb);
676 677 678 679 680 681 682 683 684 685
		err = IP_ECN_decapsulate(iph, skb);
		if (unlikely(err)) {
			if (log_ecn_error)
				net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
						     &iph->saddr, iph->tos);
			if (err > 1) {
				++tunnel->dev->stats.rx_frame_errors;
				++tunnel->dev->stats.rx_errors;
				goto out;
			}
F
Fred L. Templin 已提交
686
		}
687

E
Eric Dumazet 已提交
688
		tstats = this_cpu_ptr(tunnel->dev->tstats);
689
		u64_stats_update_begin(&tstats->syncp);
E
Eric Dumazet 已提交
690 691
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;
692
		u64_stats_update_end(&tstats->syncp);
E
Eric Dumazet 已提交
693

694
		netif_rx(skb);
E
Eric Dumazet 已提交
695

L
Linus Torvalds 已提交
696 697 698
		return 0;
	}

699 700
	/* no tunnel matched,  let upstream know, ipsec may handle it */
	return 1;
L
Linus Torvalds 已提交
701
out:
702
	kfree_skb(skb);
L
Linus Torvalds 已提交
703 704 705
	return 0;
}

S
Simon Horman 已提交
706
static const struct tnl_ptk_info ipip_tpi = {
707 708 709 710
	/* no tunnel info required for ipip. */
	.proto = htons(ETH_P_IP),
};

S
Simon Horman 已提交
711 712 713 714 715 716 717 718
#if IS_ENABLED(CONFIG_MPLS)
/* Tunnel packet info used by sit_tunnel_rcv() for MPLS-in-IPv4 payloads. */
static const struct tnl_ptk_info mplsip_tpi = {
	/* no tunnel info required for mplsip. */
	.proto = htons(ETH_P_MPLS_UC),
};
#endif

/*
 * Common receive path for non-IPv6 payloads (@ipproto is IPPROTO_IPIP
 * or IPPROTO_MPLS) carried over a SIT tunnel.
 *
 * Returns 1 when no tunnel matches, 0 when the packet was dropped, or
 * whatever ip_tunnel_rcv() returns once the packet is handed over.
 */
static int sit_tunnel_rcv(struct sk_buff *skb, u8 ipproto)
{
	const struct tnl_ptk_info *tpi = &ipip_tpi;
	const struct iphdr *iph;
	struct ip_tunnel *tunnel;
	int sifindex = 0;

	if (netif_is_l3_master(skb->dev))
		sifindex = IPCB(skb)->iif;

	iph = ip_hdr(skb);
	tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
				     iph->saddr, iph->daddr, sifindex);
	if (!tunnel)
		return 1;

	/* protocol 0 on the tunnel accepts any payload protocol */
	if (tunnel->parms.iph.protocol != ipproto &&
	    tunnel->parms.iph.protocol != 0)
		goto drop;

	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
		goto drop;

#if IS_ENABLED(CONFIG_MPLS)
	if (ipproto == IPPROTO_MPLS)
		tpi = &mplsip_tpi;
#endif

	if (iptunnel_pull_header(skb, 0, tpi->proto, false))
		goto drop;

	return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error);

drop:
	kfree_skb(skb);
	return 0;
}

S
Simon Horman 已提交
756 757 758 759 760 761 762 763 764 765 766 767
static int ipip_rcv(struct sk_buff *skb)
{
	return sit_tunnel_rcv(skb, IPPROTO_IPIP);
}

#if IS_ENABLED(CONFIG_MPLS)
static int mplsip_rcv(struct sk_buff *skb)
{
	return sit_tunnel_rcv(skb, IPPROTO_MPLS);
}
#endif

768
/*
769 770
 * If the IPv6 address comes from 6rd / 6to4 (RFC 3056) addr space this function
 * stores the embedded IPv4 address in v4dst and returns true.
771
 */
772 773
static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst,
		      __be32 *v4dst)
L
Linus Torvalds 已提交
774
{
775 776 777
#ifdef CONFIG_IPV6_SIT_6RD
	if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix,
			      tunnel->ip6rd.prefixlen)) {
E
Eric Dumazet 已提交
778
		unsigned int pbw0, pbi0;
779 780 781 782 783 784
		int pbi1;
		u32 d;

		pbw0 = tunnel->ip6rd.prefixlen >> 5;
		pbi0 = tunnel->ip6rd.prefixlen & 0x1f;

785 786 787
		d = tunnel->ip6rd.relay_prefixlen < 32 ?
			(ntohl(v6dst->s6_addr32[pbw0]) << pbi0) >>
		    tunnel->ip6rd.relay_prefixlen : 0;
788 789 790

		pbi1 = pbi0 - tunnel->ip6rd.relay_prefixlen;
		if (pbi1 > 0)
791
			d |= ntohl(v6dst->s6_addr32[pbw0 + 1]) >>
792 793
			     (32 - pbi1);

794 795
		*v4dst = tunnel->ip6rd.relay_prefix | htonl(d);
		return true;
796 797
	}
#else
L
Linus Torvalds 已提交
798
	if (v6dst->s6_addr16[0] == htons(0x2002)) {
799
		/* 6to4 v6 addr has 16 bits prefix, 32 v4addr, 16 SLA, ... */
800 801
		memcpy(v4dst, &v6dst->s6_addr16[1], 4);
		return true;
L
Linus Torvalds 已提交
802
	}
803
#endif
804 805 806 807 808 809 810 811
	return false;
}

/*
 * Return the IPv4 address embedded in @v6dst according to check_6rd(),
 * or 0 when check_6rd() does not recognise the address.
 */
static inline __be32 try_6rd(struct ip_tunnel *tunnel,
			     const struct in6_addr *v6dst)
{
	__be32 v4 = 0;

	(void)check_6rd(tunnel, v6dst, &v4);

	return v4;
}

/*
 *	This function assumes it is being called from dev_queue_xmit()
 *	and that skb is filled properly by that function.
 */

820 821
static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
				     struct net_device *dev)
L
Linus Torvalds 已提交
822
{
823
	struct ip_tunnel *tunnel = netdev_priv(dev);
824 825
	const struct iphdr  *tiph = &tunnel->parms.iph;
	const struct ipv6hdr *iph6 = ipv6_hdr(skb);
L
Linus Torvalds 已提交
826
	u8     tos = tunnel->parms.iph.tos;
827
	__be16 df = tiph->frag_off;
828 829 830
	struct rtable *rt;		/* Route to the other host */
	struct net_device *tdev;	/* Device to other host */
	unsigned int max_headroom;	/* The extra header space needed */
A
Al Viro 已提交
831
	__be32 dst = tiph->daddr;
832
	struct flowi4 fl4;
L
Linus Torvalds 已提交
833
	int    mtu;
834
	const struct in6_addr *addr6;
L
Linus Torvalds 已提交
835
	int addr_type;
836
	u8 ttl;
837 838
	u8 protocol = IPPROTO_IPV6;
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);
L
Linus Torvalds 已提交
839

840 841 842
	if (tos == 1)
		tos = ipv6_get_dsfield(iph6);

F
Fred L. Templin 已提交
843 844 845
	/* ISATAP (RFC4214) - must come before 6to4 */
	if (dev->priv_flags & IFF_ISATAP) {
		struct neighbour *neigh = NULL;
846
		bool do_tx_error = false;
F
Fred L. Templin 已提交
847

E
Eric Dumazet 已提交
848
		if (skb_dst(skb))
849
			neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr);
F
Fred L. Templin 已提交
850

851
		if (!neigh) {
852
			net_dbg_ratelimited("nexthop == NULL\n");
F
Fred L. Templin 已提交
853 854 855
			goto tx_error;
		}

E
Eldad Zack 已提交
856
		addr6 = (const struct in6_addr *)&neigh->primary_key;
F
Fred L. Templin 已提交
857 858 859 860 861 862
		addr_type = ipv6_addr_type(addr6);

		if ((addr_type & IPV6_ADDR_UNICAST) &&
		     ipv6_addr_is_isatap(addr6))
			dst = addr6->s6_addr32[3];
		else
863 864 865 866
			do_tx_error = true;

		neigh_release(neigh);
		if (do_tx_error)
F
Fred L. Templin 已提交
867 868 869
			goto tx_error;
	}

L
Linus Torvalds 已提交
870
	if (!dst)
871
		dst = try_6rd(tunnel, &iph6->daddr);
L
Linus Torvalds 已提交
872 873 874

	if (!dst) {
		struct neighbour *neigh = NULL;
875
		bool do_tx_error = false;
L
Linus Torvalds 已提交
876

E
Eric Dumazet 已提交
877
		if (skb_dst(skb))
878
			neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr);
L
Linus Torvalds 已提交
879

880
		if (!neigh) {
881
			net_dbg_ratelimited("nexthop == NULL\n");
L
Linus Torvalds 已提交
882 883 884
			goto tx_error;
		}

E
Eldad Zack 已提交
885
		addr6 = (const struct in6_addr *)&neigh->primary_key;
L
Linus Torvalds 已提交
886 887 888
		addr_type = ipv6_addr_type(addr6);

		if (addr_type == IPV6_ADDR_ANY) {
889
			addr6 = &ipv6_hdr(skb)->daddr;
L
Linus Torvalds 已提交
890 891 892
			addr_type = ipv6_addr_type(addr6);
		}

893 894 895 896
		if ((addr_type & IPV6_ADDR_COMPATv4) != 0)
			dst = addr6->s6_addr32[3];
		else
			do_tx_error = true;
L
Linus Torvalds 已提交
897

898 899 900
		neigh_release(neigh);
		if (do_tx_error)
			goto tx_error;
L
Linus Torvalds 已提交
901 902
	}

903 904 905 906 907 908
	flowi4_init_output(&fl4, tunnel->parms.link, tunnel->fwmark,
			   RT_TOS(tos), RT_SCOPE_UNIVERSE, IPPROTO_IPV6,
			   0, dst, tiph->saddr, 0, 0,
			   sock_net_uid(tunnel->net, NULL));
	rt = ip_route_output_flow(tunnel->net, &fl4, NULL);

909 910 911
	if (IS_ERR(rt)) {
		dev->stats.tx_carrier_errors++;
		goto tx_error_icmp;
L
Linus Torvalds 已提交
912 913 914
	}
	if (rt->rt_type != RTN_UNICAST) {
		ip_rt_put(rt);
E
Eric Dumazet 已提交
915
		dev->stats.tx_carrier_errors++;
L
Linus Torvalds 已提交
916 917
		goto tx_error_icmp;
	}
918
	tdev = rt->dst.dev;
L
Linus Torvalds 已提交
919 920 921

	if (tdev == dev) {
		ip_rt_put(rt);
E
Eric Dumazet 已提交
922
		dev->stats.collisions++;
L
Linus Torvalds 已提交
923 924 925
		goto tx_error;
	}

926
	if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) {
927
		ip_rt_put(rt);
928
		goto tx_error;
929 930
	}

931
	if (df) {
932
		mtu = dst_mtu(&rt->dst) - t_hlen;
L
Linus Torvalds 已提交
933

934
		if (mtu < 68) {
E
Eric Dumazet 已提交
935
			dev->stats.collisions++;
936 937 938
			ip_rt_put(rt);
			goto tx_error;
		}
L
Linus Torvalds 已提交
939

940 941 942 943 944
		if (mtu < IPV6_MIN_MTU) {
			mtu = IPV6_MIN_MTU;
			df = 0;
		}

945 946
		if (tunnel->parms.iph.daddr)
			skb_dst_update_pmtu(skb, mtu);
947

948
		if (skb->len > mtu && !skb_is_gso(skb)) {
949
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
950 951 952
			ip_rt_put(rt);
			goto tx_error;
		}
L
Linus Torvalds 已提交
953 954 955
	}

	if (tunnel->err_count > 0) {
956 957
		if (time_before(jiffies,
				tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
L
Linus Torvalds 已提交
958 959 960 961 962 963 964 965 966
			tunnel->err_count--;
			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	/*
	 * Okay, now see if we can stuff it in the buffer as-is.
	 */
967
	max_headroom = LL_RESERVED_SPACE(tdev) + t_hlen;
L
Linus Torvalds 已提交
968

969 970
	if (skb_headroom(skb) < max_headroom || skb_shared(skb) ||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
L
Linus Torvalds 已提交
971 972 973
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
E
Eric Dumazet 已提交
974
			dev->stats.tx_dropped++;
975
			kfree_skb(skb);
976
			return NETDEV_TX_OK;
L
Linus Torvalds 已提交
977 978 979 980 981
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
982
		iph6 = ipv6_hdr(skb);
L
Linus Torvalds 已提交
983
	}
984 985 986 987
	ttl = tiph->ttl;
	if (ttl == 0)
		ttl = iph6->hop_limit;
	tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6));
L
Linus Torvalds 已提交
988

989
	if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) {
990
		ip_rt_put(rt);
991
		goto tx_error;
992
	}
993

T
Tom Herbert 已提交
994 995
	skb_set_inner_ipproto(skb, IPPROTO_IPV6);

996 997
	iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
		      df, !net_eq(tunnel->net, dev_net(dev)));
998
	return NETDEV_TX_OK;
L
Linus Torvalds 已提交
999 1000 1001 1002

tx_error_icmp:
	dst_link_failure(skb);
tx_error:
1003
	kfree_skb(skb);
E
Eric Dumazet 已提交
1004
	dev->stats.tx_errors++;
1005
	return NETDEV_TX_OK;
L
Linus Torvalds 已提交
1006 1007
}

S
Simon Horman 已提交
1008 1009
/*
 * Transmit a non-IPv6 payload (@ipproto) through the generic
 * ip_tunnel_xmit() path.  On offload setup failure the skb is freed and
 * tx_errors is incremented.  Always returns NETDEV_TX_OK.
 */
static netdev_tx_t sit_tunnel_xmit__(struct sk_buff *skb,
				     struct net_device *dev, u8 ipproto)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);

	if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) {
		kfree_skb(skb);
		dev->stats.tx_errors++;
		return NETDEV_TX_OK;
	}

	skb_set_inner_ipproto(skb, ipproto);
	ip_tunnel_xmit(skb, dev, &tunnel->parms.iph, ipproto);

	return NETDEV_TX_OK;
}

static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb,
				   struct net_device *dev)
{
1030 1031 1032
	if (!pskb_inet_may_pull(skb))
		goto tx_err;

1033 1034
	switch (skb->protocol) {
	case htons(ETH_P_IP):
S
Simon Horman 已提交
1035
		sit_tunnel_xmit__(skb, dev, IPPROTO_IPIP);
1036 1037 1038 1039
		break;
	case htons(ETH_P_IPV6):
		ipip6_tunnel_xmit(skb, dev);
		break;
S
Simon Horman 已提交
1040 1041 1042 1043 1044
#if IS_ENABLED(CONFIG_MPLS)
	case htons(ETH_P_MPLS_UC):
		sit_tunnel_xmit__(skb, dev, IPPROTO_MPLS);
		break;
#endif
1045 1046 1047 1048 1049 1050 1051 1052
	default:
		goto tx_err;
	}

	return NETDEV_TX_OK;

tx_err:
	dev->stats.tx_errors++;
1053
	kfree_skb(skb);
1054 1055 1056 1057
	return NETDEV_TX_OK;

}

1058 1059 1060 1061
static void ipip6_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
1062
	const struct iphdr *iph;
1063
	struct flowi4 fl4;
1064 1065 1066 1067 1068

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	if (iph->daddr) {
N
Nicolas Dichtel 已提交
1069 1070
		struct rtable *rt = ip_route_output_ports(tunnel->net, &fl4,
							  NULL,
1071 1072 1073 1074 1075
							  iph->daddr, iph->saddr,
							  0, 0,
							  IPPROTO_IPV6,
							  RT_TOS(iph->tos),
							  tunnel->parms.link);
1076 1077

		if (!IS_ERR(rt)) {
1078
			tdev = rt->dst.dev;
1079 1080 1081 1082 1083 1084
			ip_rt_put(rt);
		}
		dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
N
Nicolas Dichtel 已提交
1085
		tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link);
1086

1087
	if (tdev && !netif_is_l3_master(tdev)) {
1088 1089
		int t_hlen = tunnel->hlen + sizeof(struct iphdr);

1090
		dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr);
1091
		dev->mtu = tdev->mtu - t_hlen;
1092 1093 1094 1095 1096
		if (dev->mtu < IPV6_MIN_MTU)
			dev->mtu = IPV6_MIN_MTU;
	}
}

1097 1098
/*
 * Apply new parameters @p and @fwmark to an existing tunnel @t.
 *
 * The tunnel is unlinked from the per-netns tables, the endpoint
 * addresses are rewritten after synchronize_net(), and the tunnel is
 * linked back (presumably because the tables are keyed on the endpoint
 * addresses -- confirm against ipip6_tunnel_link()).  ttl/tos/frag_off
 * are then copied, the lower device is re-bound if the link index or
 * fwmark changed, and finally the dst cache is flushed and a state
 * change notification is sent.
 */
static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p,
				__u32 fwmark)
{
	struct sit_net *sitn = net_generic(t->net, sit_net_id);
	struct net_device *dev = t->dev;
	struct iphdr *cur = &t->parms.iph;
	const struct iphdr *req = &p->iph;

	ipip6_tunnel_unlink(sitn, t);
	synchronize_net();
	cur->saddr = req->saddr;
	cur->daddr = req->daddr;
	memcpy(dev->dev_addr, &req->saddr, 4);
	memcpy(dev->broadcast, &req->daddr, 4);
	ipip6_tunnel_link(sitn, t);

	cur->ttl = req->ttl;
	cur->tos = req->tos;
	cur->frag_off = req->frag_off;

	if (t->parms.link != p->link || t->fwmark != fwmark) {
		t->parms.link = p->link;
		t->fwmark = fwmark;
		ipip6_tunnel_bind_dev(dev);
	}

	dst_cache_reset(&t->dst_cache);
	netdev_state_change(dev);
}

1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148
#ifdef CONFIG_IPV6_SIT_6RD
static int ipip6_tunnel_update_6rd(struct ip_tunnel *t,
				   struct ip_tunnel_6rd *ip6rd)
{
	struct in6_addr prefix;
	__be32 relay_prefix;

	if (ip6rd->relay_prefixlen > 32 ||
	    ip6rd->prefixlen + (32 - ip6rd->relay_prefixlen) > 64)
		return -EINVAL;

	ipv6_addr_prefix(&prefix, &ip6rd->prefix, ip6rd->prefixlen);
	if (!ipv6_addr_equal(&prefix, &ip6rd->prefix))
		return -EINVAL;
	if (ip6rd->relay_prefixlen)
		relay_prefix = ip6rd->relay_prefix &
			       htonl(0xffffffffUL <<
				     (32 - ip6rd->relay_prefixlen));
	else
		relay_prefix = 0;
	if (relay_prefix != ip6rd->relay_prefix)
		return -EINVAL;

	t->ip6rd.prefix = prefix;
	t->ip6rd.relay_prefix = relay_prefix;
	t->ip6rd.prefixlen = ip6rd->prefixlen;
	t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen;
1149
	dst_cache_reset(&t->dst_cache);
1150 1151 1152 1153 1154
	netdev_state_change(t->dev);
	return 0;
}
#endif

1155
/*
 * Return true if @ipproto is an outer protocol number this driver
 * accepts in tunnel parameters: IPv6, IPIP, MPLS (only when CONFIG_MPLS
 * is enabled) or 0.
 * NOTE(review): 0 presumably means "any protocol" -- confirm against the
 * receive path.
 */
static bool ipip6_valid_ip_proto(u8 ipproto)
{
	switch (ipproto) {
	case 0:
	case IPPROTO_IPV6:
	case IPPROTO_IPIP:
#if IS_ENABLED(CONFIG_MPLS)
	case IPPROTO_MPLS:
#endif
		return true;
	default:
		return false;
	}
}

L
Linus Torvalds 已提交
1165
static int
1166
ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
L
Linus Torvalds 已提交
1167 1168 1169
{
	int err = 0;
	struct ip_tunnel_parm p;
1170
	struct ip_tunnel_prl prl;
1171 1172
	struct ip_tunnel *t = netdev_priv(dev);
	struct net *net = t->net;
1173
	struct sit_net *sitn = net_generic(net, sit_net_id);
1174 1175 1176
#ifdef CONFIG_IPV6_SIT_6RD
	struct ip_tunnel_6rd ip6rd;
#endif
L
Linus Torvalds 已提交
1177 1178 1179

	switch (cmd) {
	case SIOCGETTUNNEL:
1180 1181 1182
#ifdef CONFIG_IPV6_SIT_6RD
	case SIOCGET6RD:
#endif
1183
		if (dev == sitn->fb_tunnel_dev) {
L
Linus Torvalds 已提交
1184 1185 1186 1187
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
1188
			t = ipip6_tunnel_locate(net, &p, 0);
1189
			if (!t)
1190
				t = netdev_priv(dev);
L
Linus Torvalds 已提交
1191
		}
1192 1193 1194 1195 1196 1197 1198 1199 1200

		err = -EFAULT;
		if (cmd == SIOCGETTUNNEL) {
			memcpy(&p, &t->parms, sizeof(p));
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &p,
					 sizeof(p)))
				goto done;
#ifdef CONFIG_IPV6_SIT_6RD
		} else {
A
Alexey Dobriyan 已提交
1201
			ip6rd.prefix = t->ip6rd.prefix;
1202 1203 1204 1205 1206 1207 1208 1209 1210
			ip6rd.relay_prefix = t->ip6rd.relay_prefix;
			ip6rd.prefixlen = t->ip6rd.prefixlen;
			ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen;
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &ip6rd,
					 sizeof(ip6rd)))
				goto done;
#endif
		}
		err = 0;
L
Linus Torvalds 已提交
1211 1212 1213 1214 1215
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
1216
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
L
Linus Torvalds 已提交
1217 1218 1219 1220 1221 1222 1223
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		err = -EINVAL;
S
Simon Horman 已提交
1224
		if (!ipip6_valid_ip_proto(p.iph.protocol))
1225 1226
			goto done;
		if (p.iph.version != 4 ||
L
Linus Torvalds 已提交
1227 1228 1229 1230 1231
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

1232
		t = ipip6_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
L
Linus Torvalds 已提交
1233

1234
		if (dev != sitn->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1235
			if (t) {
L
Linus Torvalds 已提交
1236 1237 1238 1239 1240 1241 1242 1243 1244 1245
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) ||
				    (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) {
					err = -EINVAL;
					break;
				}
1246
				t = netdev_priv(dev);
L
Linus Torvalds 已提交
1247
			}
1248

1249
			ipip6_tunnel_update(t, &p, t->fwmark);
L
Linus Torvalds 已提交
1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261
		}

		if (t) {
			err = 0;
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
1262
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
L
Linus Torvalds 已提交
1263 1264
			goto done;

1265
		if (dev == sitn->fb_tunnel_dev) {
L
Linus Torvalds 已提交
1266 1267 1268 1269
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
1270
			t = ipip6_tunnel_locate(net, &p, 0);
1271
			if (!t)
L
Linus Torvalds 已提交
1272 1273
				goto done;
			err = -EPERM;
1274
			if (t == netdev_priv(sitn->fb_tunnel_dev))
L
Linus Torvalds 已提交
1275 1276 1277
				goto done;
			dev = t->dev;
		}
1278 1279
		unregister_netdevice(dev);
		err = 0;
L
Linus Torvalds 已提交
1280 1281
		break;

1282
	case SIOCGETPRL:
1283 1284 1285 1286 1287 1288
		err = -EINVAL;
		if (dev == sitn->fb_tunnel_dev)
			goto done;
		err = ipip6_tunnel_get_prl(t, ifr->ifr_ifru.ifru_data);
		break;

1289 1290 1291 1292
	case SIOCADDPRL:
	case SIOCDELPRL:
	case SIOCCHGPRL:
		err = -EPERM;
1293
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1294 1295
			goto done;
		err = -EINVAL;
1296
		if (dev == sitn->fb_tunnel_dev)
1297 1298 1299 1300 1301
			goto done;
		err = -EFAULT;
		if (copy_from_user(&prl, ifr->ifr_ifru.ifru_data, sizeof(prl)))
			goto done;

1302 1303
		switch (cmd) {
		case SIOCDELPRL:
1304
			err = ipip6_tunnel_del_prl(t, &prl);
1305 1306 1307
			break;
		case SIOCADDPRL:
		case SIOCCHGPRL:
1308
			err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL);
1309 1310
			break;
		}
1311
		dst_cache_reset(&t->dst_cache);
1312
		netdev_state_change(dev);
1313 1314
		break;

1315 1316 1317 1318 1319
#ifdef CONFIG_IPV6_SIT_6RD
	case SIOCADD6RD:
	case SIOCCHG6RD:
	case SIOCDEL6RD:
		err = -EPERM;
1320
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1321 1322 1323 1324 1325 1326 1327 1328
			goto done;

		err = -EFAULT;
		if (copy_from_user(&ip6rd, ifr->ifr_ifru.ifru_data,
				   sizeof(ip6rd)))
			goto done;

		if (cmd != SIOCDEL6RD) {
1329 1330
			err = ipip6_tunnel_update_6rd(t, &ip6rd);
			if (err < 0)
1331 1332
				goto done;
		} else
1333
			ipip6_tunnel_clone_6rd(dev, sitn);
1334 1335 1336 1337 1338

		err = 0;
		break;
#endif

L
Linus Torvalds 已提交
1339 1340 1341 1342 1343 1344 1345 1346
	default:
		err = -EINVAL;
	}

done:
	return err;
}

1347
static const struct net_device_ops ipip6_netdev_ops = {
1348
	.ndo_init	= ipip6_tunnel_init,
1349
	.ndo_uninit	= ipip6_tunnel_uninit,
1350
	.ndo_start_xmit	= sit_tunnel_xmit,
1351
	.ndo_do_ioctl	= ipip6_tunnel_ioctl,
1352
	.ndo_get_stats64 = ip_tunnel_get_stats64,
1353
	.ndo_get_iflink = ip_tunnel_get_iflink,
1354 1355
};

E
Eric Dumazet 已提交
1356 1357
static void ipip6_dev_free(struct net_device *dev)
{
1358 1359
	struct ip_tunnel *tunnel = netdev_priv(dev);

1360
	dst_cache_destroy(&tunnel->dst_cache);
E
Eric Dumazet 已提交
1361 1362 1363
	free_percpu(dev->tstats);
}

E
Eric Dumazet 已提交
1364 1365 1366 1367 1368 1369
/* Feature bits set in both features and hw_features of every sit device. */
#define SIT_FEATURES (NETIF_F_SG | \
		      NETIF_F_FRAGLIST | \
		      NETIF_F_HIGHDMA | \
		      NETIF_F_GSO_SOFTWARE | \
		      NETIF_F_HW_CSUM)

L
Linus Torvalds 已提交
1370 1371
static void ipip6_tunnel_setup(struct net_device *dev)
{
1372 1373 1374
	struct ip_tunnel *tunnel = netdev_priv(dev);
	int t_hlen = tunnel->hlen + sizeof(struct iphdr);

1375
	dev->netdev_ops		= &ipip6_netdev_ops;
1376 1377
	dev->needs_free_netdev	= true;
	dev->priv_destructor	= ipip6_dev_free;
L
Linus Torvalds 已提交
1378 1379

	dev->type		= ARPHRD_SIT;
1380 1381
	dev->hard_header_len	= LL_MAX_HEADER + t_hlen;
	dev->mtu		= ETH_DATA_LEN - t_hlen;
1382
	dev->min_mtu		= IPV6_MIN_MTU;
1383
	dev->max_mtu		= IP6_MAX_MTU - t_hlen;
L
Linus Torvalds 已提交
1384
	dev->flags		= IFF_NOARP;
1385
	netif_keep_dst(dev);
L
Linus Torvalds 已提交
1386
	dev->addr_len		= 4;
E
Eric Dumazet 已提交
1387
	dev->features		|= NETIF_F_LLTX;
E
Eric Dumazet 已提交
1388 1389
	dev->features		|= SIT_FEATURES;
	dev->hw_features	|= SIT_FEATURES;
L
Linus Torvalds 已提交
1390 1391
}

E
Eric Dumazet 已提交
1392
static int ipip6_tunnel_init(struct net_device *dev)
L
Linus Torvalds 已提交
1393
{
1394
	struct ip_tunnel *tunnel = netdev_priv(dev);
1395
	int err;
L
Linus Torvalds 已提交
1396 1397

	tunnel->dev = dev;
N
Nicolas Dichtel 已提交
1398
	tunnel->net = dev_net(dev);
1399
	strcpy(tunnel->parms.name, dev->name);
L
Linus Torvalds 已提交
1400

1401
	ipip6_tunnel_bind_dev(dev);
1402
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
E
Eric Dumazet 已提交
1403 1404 1405
	if (!dev->tstats)
		return -ENOMEM;

1406 1407
	err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
	if (err) {
1408
		free_percpu(dev->tstats);
W
WANG Cong 已提交
1409
		dev->tstats = NULL;
1410
		return err;
1411 1412
	}

E
Eric Dumazet 已提交
1413
	return 0;
L
Linus Torvalds 已提交
1414 1415
}

1416
/*
 * Finish initialising the per-netns fallback tunnel: give it a default
 * IPv4 header template (IPv6 payload, TTL 64), take a reference on the
 * device and publish the tunnel in the wildcard slot of the netns table.
 */
static void __net_init ipip6_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *fb = netdev_priv(dev);
	struct sit_net *sitn = net_generic(dev_net(dev), sit_net_id);
	struct iphdr *tmpl = &fb->parms.iph;

	tmpl->version = 4;
	tmpl->ihl = 5;
	tmpl->protocol = IPPROTO_IPV6;
	tmpl->ttl = 64;

	dev_hold(dev);
	rcu_assign_pointer(sitn->tunnels_wc[0], fb);
}

1432 1433
static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[],
			  struct netlink_ext_ack *extack)
1434 1435 1436
{
	u8 proto;

1437
	if (!data || !data[IFLA_IPTUN_PROTO])
1438 1439 1440
		return 0;

	proto = nla_get_u8(data[IFLA_IPTUN_PROTO]);
S
Simon Horman 已提交
1441
	if (!ipip6_valid_ip_proto(proto))
1442 1443 1444 1445 1446
		return -EINVAL;

	return 0;
}

1447
/*
 * Translate IFLA_IPTUN_* netlink attributes into @parms and @fwmark.
 *
 * @parms is zeroed and seeded with defaults (IPv4 header, IPv6 payload,
 * TTL 64) before any attribute is applied.  @fwmark is written only when
 * IFLA_IPTUN_FWMARK is present.  DF is set when a non-zero TTL is given
 * and also whenever IFLA_IPTUN_PMTUDISC is absent or non-zero.
 */
static void ipip6_netlink_parms(struct nlattr *data[],
				struct ip_tunnel_parm *parms,
				__u32 *fwmark)
{
	struct iphdr *iph = &parms->iph;

	memset(parms, 0, sizeof(*parms));

	iph->version = 4;
	iph->ihl = 5;
	iph->protocol = IPPROTO_IPV6;
	iph->ttl = 64;

	if (!data)
		return;

	if (data[IFLA_IPTUN_LINK])
		parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]);

	if (data[IFLA_IPTUN_LOCAL])
		iph->saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]);

	if (data[IFLA_IPTUN_REMOTE])
		iph->daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]);

	if (data[IFLA_IPTUN_TTL]) {
		iph->ttl = nla_get_u8(data[IFLA_IPTUN_TTL]);
		if (iph->ttl)
			iph->frag_off = htons(IP_DF);
	}

	if (data[IFLA_IPTUN_TOS])
		iph->tos = nla_get_u8(data[IFLA_IPTUN_TOS]);

	if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC]))
		iph->frag_off = htons(IP_DF);

	if (data[IFLA_IPTUN_FLAGS])
		parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]);

	if (data[IFLA_IPTUN_PROTO])
		iph->protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]);

	if (data[IFLA_IPTUN_FWMARK])
		*fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]);
}

1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514
/*
 * Fill @ipencap from the IFLA_IPTUN_ENCAP_* attributes in @data.
 * @ipencap is always zeroed first.  Returns true when at least one
 * ENCAP attribute was present in the netlink message, false otherwise
 * (including when @data is NULL).
 */
static bool ipip6_netlink_encap_parms(struct nlattr *data[],
				      struct ip_tunnel_encap *ipencap)
{
	bool seen = false;

	memset(ipencap, 0, sizeof(*ipencap));

	if (!data)
		return false;

	if (data[IFLA_IPTUN_ENCAP_TYPE]) {
		ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]);
		seen = true;
	}

	if (data[IFLA_IPTUN_ENCAP_FLAGS]) {
		ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]);
		seen = true;
	}

	if (data[IFLA_IPTUN_ENCAP_SPORT]) {
		ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]);
		seen = true;
	}

	if (data[IFLA_IPTUN_ENCAP_DPORT]) {
		ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]);
		seen = true;
	}

	return seen;
}

1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538
#ifdef CONFIG_IPV6_SIT_6RD
/*
 * Fill @ip6rd from the IFLA_IPTUN_6RD_* attributes in @data.
 * @ip6rd is always zeroed first.  Returns true when at least one 6RD
 * attribute was present in the netlink message, false otherwise
 * (including when @data is NULL).
 */
static bool ipip6_netlink_6rd_parms(struct nlattr *data[],
				    struct ip_tunnel_6rd *ip6rd)
{
	bool seen = false;

	memset(ip6rd, 0, sizeof(*ip6rd));

	if (!data)
		return false;

	if (data[IFLA_IPTUN_6RD_PREFIX]) {
		ip6rd->prefix = nla_get_in6_addr(data[IFLA_IPTUN_6RD_PREFIX]);
		seen = true;
	}

	if (data[IFLA_IPTUN_6RD_RELAY_PREFIX]) {
		ip6rd->relay_prefix =
			nla_get_be32(data[IFLA_IPTUN_6RD_RELAY_PREFIX]);
		seen = true;
	}

	if (data[IFLA_IPTUN_6RD_PREFIXLEN]) {
		ip6rd->prefixlen = nla_get_u16(data[IFLA_IPTUN_6RD_PREFIXLEN]);
		seen = true;
	}

	if (data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]) {
		ip6rd->relay_prefixlen =
			nla_get_u16(data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]);
		seen = true;
	}

	return seen;
}
#endif

1563
static int ipip6_newlink(struct net *src_net, struct net_device *dev,
1564 1565
			 struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
1566 1567 1568
{
	struct net *net = dev_net(dev);
	struct ip_tunnel *nt;
1569
	struct ip_tunnel_encap ipencap;
1570 1571 1572 1573
#ifdef CONFIG_IPV6_SIT_6RD
	struct ip_tunnel_6rd ip6rd;
#endif
	int err;
1574 1575

	nt = netdev_priv(dev);
1576 1577 1578 1579 1580 1581 1582

	if (ipip6_netlink_encap_parms(data, &ipencap)) {
		err = ip_tunnel_encap_setup(nt, &ipencap);
		if (err < 0)
			return err;
	}

1583
	ipip6_netlink_parms(data, &nt->parms, &nt->fwmark);
1584 1585 1586 1587

	if (ipip6_tunnel_locate(net, &nt->parms, 0))
		return -EEXIST;

1588 1589 1590 1591
	err = ipip6_tunnel_create(dev);
	if (err < 0)
		return err;

X
Xin Long 已提交
1592 1593 1594
	if (tb[IFLA_MTU]) {
		u32 mtu = nla_get_u32(tb[IFLA_MTU]);

1595 1596
		if (mtu >= IPV6_MIN_MTU &&
		    mtu <= IP6_MAX_MTU - dev->hard_header_len)
X
Xin Long 已提交
1597 1598 1599
			dev->mtu = mtu;
	}

1600 1601 1602 1603 1604 1605
#ifdef CONFIG_IPV6_SIT_6RD
	if (ipip6_netlink_6rd_parms(data, &ip6rd))
		err = ipip6_tunnel_update_6rd(nt, &ip6rd);
#endif

	return err;
1606 1607 1608
}

static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[],
1609 1610
			    struct nlattr *data[],
			    struct netlink_ext_ack *extack)
1611
{
1612
	struct ip_tunnel *t = netdev_priv(dev);
1613
	struct ip_tunnel_parm p;
1614
	struct ip_tunnel_encap ipencap;
1615
	struct net *net = t->net;
1616
	struct sit_net *sitn = net_generic(net, sit_net_id);
1617 1618 1619
#ifdef CONFIG_IPV6_SIT_6RD
	struct ip_tunnel_6rd ip6rd;
#endif
1620
	__u32 fwmark = t->fwmark;
1621
	int err;
1622 1623 1624 1625

	if (dev == sitn->fb_tunnel_dev)
		return -EINVAL;

1626 1627 1628 1629 1630 1631
	if (ipip6_netlink_encap_parms(data, &ipencap)) {
		err = ip_tunnel_encap_setup(t, &ipencap);
		if (err < 0)
			return err;
	}

1632
	ipip6_netlink_parms(data, &p, &fwmark);
1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645

	if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) ||
	    (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr))
		return -EINVAL;

	t = ipip6_tunnel_locate(net, &p, 0);

	if (t) {
		if (t->dev != dev)
			return -EEXIST;
	} else
		t = netdev_priv(dev);

1646
	ipip6_tunnel_update(t, &p, fwmark);
1647 1648 1649 1650 1651 1652

#ifdef CONFIG_IPV6_SIT_6RD
	if (ipip6_netlink_6rd_parms(data, &ip6rd))
		return ipip6_tunnel_update_6rd(t, &ip6rd);
#endif

1653 1654 1655
	return 0;
}

1656
/*
 * rtnl get_size callback: total netlink attribute space needed for the
 * attributes accounted for below (the 6rd ones only with
 * CONFIG_IPV6_SIT_6RD).
 */
static size_t ipip6_get_size(const struct net_device *dev)
{
	size_t len = 0;

	len += nla_total_size(4);	/* IFLA_IPTUN_LINK */
	len += nla_total_size(4);	/* IFLA_IPTUN_LOCAL */
	len += nla_total_size(4);	/* IFLA_IPTUN_REMOTE */
	len += nla_total_size(1);	/* IFLA_IPTUN_TTL */
	len += nla_total_size(1);	/* IFLA_IPTUN_TOS */
	len += nla_total_size(1);	/* IFLA_IPTUN_PMTUDISC */
	len += nla_total_size(2);	/* IFLA_IPTUN_FLAGS */
	len += nla_total_size(1);	/* IFLA_IPTUN_PROTO */
#ifdef CONFIG_IPV6_SIT_6RD
	/* IFLA_IPTUN_6RD_PREFIX */
	len += nla_total_size(sizeof(struct in6_addr));
	len += nla_total_size(4);	/* IFLA_IPTUN_6RD_RELAY_PREFIX */
	len += nla_total_size(2);	/* IFLA_IPTUN_6RD_PREFIXLEN */
	len += nla_total_size(2);	/* IFLA_IPTUN_6RD_RELAY_PREFIXLEN */
#endif
	len += nla_total_size(2);	/* IFLA_IPTUN_ENCAP_TYPE */
	len += nla_total_size(2);	/* IFLA_IPTUN_ENCAP_FLAGS */
	len += nla_total_size(2);	/* IFLA_IPTUN_ENCAP_SPORT */
	len += nla_total_size(2);	/* IFLA_IPTUN_ENCAP_DPORT */
	len += nla_total_size(4);	/* IFLA_IPTUN_FWMARK */

	return len;
}

1698
static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev)
1699 1700 1701 1702 1703
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct ip_tunnel_parm *parm = &tunnel->parms;

	if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) ||
1704 1705
	    nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) ||
	    nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) ||
1706
	    nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) ||
1707 1708 1709
	    nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) ||
	    nla_put_u8(skb, IFLA_IPTUN_PMTUDISC,
		       !!(parm->iph.frag_off & htons(IP_DF))) ||
1710
	    nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) ||
1711 1712
	    nla_put_be16(skb, IFLA_IPTUN_FLAGS, parm->i_flags) ||
	    nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark))
1713
		goto nla_put_failure;
1714 1715

#ifdef CONFIG_IPV6_SIT_6RD
1716 1717 1718 1719
	if (nla_put_in6_addr(skb, IFLA_IPTUN_6RD_PREFIX,
			     &tunnel->ip6rd.prefix) ||
	    nla_put_in_addr(skb, IFLA_IPTUN_6RD_RELAY_PREFIX,
			    tunnel->ip6rd.relay_prefix) ||
1720 1721 1722 1723 1724 1725 1726
	    nla_put_u16(skb, IFLA_IPTUN_6RD_PREFIXLEN,
			tunnel->ip6rd.prefixlen) ||
	    nla_put_u16(skb, IFLA_IPTUN_6RD_RELAY_PREFIXLEN,
			tunnel->ip6rd.relay_prefixlen))
		goto nla_put_failure;
#endif

1727 1728
	if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE,
			tunnel->encap.type) ||
1729
	    nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT,
1730
			tunnel->encap.sport) ||
1731
	    nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT,
1732 1733
			tunnel->encap.dport) ||
	    nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS,
1734
			tunnel->encap.flags))
1735 1736
		goto nla_put_failure;

1737 1738 1739 1740 1741 1742
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

1743 1744 1745 1746 1747 1748 1749 1750
static const struct nla_policy ipip6_policy[IFLA_IPTUN_MAX + 1] = {
	[IFLA_IPTUN_LINK]		= { .type = NLA_U32 },
	[IFLA_IPTUN_LOCAL]		= { .type = NLA_U32 },
	[IFLA_IPTUN_REMOTE]		= { .type = NLA_U32 },
	[IFLA_IPTUN_TTL]		= { .type = NLA_U8 },
	[IFLA_IPTUN_TOS]		= { .type = NLA_U8 },
	[IFLA_IPTUN_PMTUDISC]		= { .type = NLA_U8 },
	[IFLA_IPTUN_FLAGS]		= { .type = NLA_U16 },
1751
	[IFLA_IPTUN_PROTO]		= { .type = NLA_U8 },
1752 1753 1754 1755 1756 1757
#ifdef CONFIG_IPV6_SIT_6RD
	[IFLA_IPTUN_6RD_PREFIX]		= { .len = sizeof(struct in6_addr) },
	[IFLA_IPTUN_6RD_RELAY_PREFIX]	= { .type = NLA_U32 },
	[IFLA_IPTUN_6RD_PREFIXLEN]	= { .type = NLA_U16 },
	[IFLA_IPTUN_6RD_RELAY_PREFIXLEN] = { .type = NLA_U16 },
#endif
1758 1759 1760 1761
	[IFLA_IPTUN_ENCAP_TYPE]		= { .type = NLA_U16 },
	[IFLA_IPTUN_ENCAP_FLAGS]	= { .type = NLA_U16 },
	[IFLA_IPTUN_ENCAP_SPORT]	= { .type = NLA_U16 },
	[IFLA_IPTUN_ENCAP_DPORT]	= { .type = NLA_U16 },
1762
	[IFLA_IPTUN_FWMARK]		= { .type = NLA_U32 },
1763 1764
};

1765 1766 1767 1768 1769 1770 1771 1772 1773
/*
 * rtnl dellink callback: queue @dev for unregistration on @head, except
 * for the netns fallback device, which is silently left in place.
 */
static void ipip6_dellink(struct net_device *dev, struct list_head *head)
{
	struct sit_net *sitn = net_generic(dev_net(dev), sit_net_id);

	if (dev == sitn->fb_tunnel_dev)
		return;

	unregister_netdevice_queue(dev, head);
}

1774 1775 1776
/* rtnetlink link operations for devices of kind "sit". */
static struct rtnl_link_ops sit_link_ops __read_mostly = {
	.kind = "sit",
	.maxtype = IFLA_IPTUN_MAX,
	.policy = ipip6_policy,
	.priv_size = sizeof(struct ip_tunnel),
	.setup = ipip6_tunnel_setup,
	.validate = ipip6_validate,
	.newlink = ipip6_newlink,
	.changelink = ipip6_changelink,
	.get_size = ipip6_get_size,
	.fill_info = ipip6_fill_info,
	.dellink = ipip6_dellink,
	.get_link_net = ip_tunnel_get_link_net,
};

1789
/* xfrm4 tunnel handler registered for AF_INET6 payloads (see sit_init()). */
static struct xfrm_tunnel sit_handler __read_mostly = {
	.handler = ipip6_rcv,
	.err_handler = ipip6_err,
	.priority = 1,
};

1795 1796 1797 1798 1799 1800
/* xfrm4 tunnel handler registered for AF_INET payloads (see sit_init()). */
static struct xfrm_tunnel ipip_handler __read_mostly = {
	.handler = ipip_rcv,
	.err_handler = ipip6_err,
	.priority = 2,
};

S
Simon Horman 已提交
1801 1802 1803 1804 1805 1806 1807 1808
#if IS_ENABLED(CONFIG_MPLS)
/* xfrm4 tunnel handler registered for AF_MPLS payloads (see sit_init()). */
static struct xfrm_tunnel mplsip_handler __read_mostly = {
	.handler = mplsip_rcv,
	.err_handler = ipip6_err,
	.priority = 2,
};
#endif

1809 1810
/*
 * Queue every sit device belonging to @net on @head for unregistration.
 *
 * First pass: all devices living in @net whose link ops are sit's.
 * Second pass: walk the netns tunnel tables (priorities 1..3) and queue
 * devices that live in a different netns; devices in @net itself were
 * already queued by the first pass.
 */
static void __net_exit sit_destroy_tunnels(struct net *net,
					   struct list_head *head)
{
	struct sit_net *sitn = net_generic(net, sit_net_id);
	struct net_device *dev, *tmp;
	struct ip_tunnel *t;
	int prio, h;

	for_each_netdev_safe(net, dev, tmp) {
		if (dev->rtnl_link_ops == &sit_link_ops)
			unregister_netdevice_queue(dev, head);
	}

	for (prio = 1; prio < 4; prio++) {
		for (h = 0; h < IP6_SIT_HASH_SIZE; h++) {
			for (t = rtnl_dereference(sitn->tunnels[prio][h]);
			     t;
			     t = rtnl_dereference(t->next)) {
				/* If dev is in the same netns, it has already
				 * been added to the list by the previous loop.
				 */
				if (!net_eq(dev_net(t->dev), net))
					unregister_netdevice_queue(t->dev,
								   head);
			}
		}
	}
}

1839
/*
 * Per-netns initialisation: wire up the tunnel table pointers and,
 * when the netns is allowed fallback tunnels, allocate and register the
 * "sit0" fallback device.
 *
 * Returns 0 on success, -ENOMEM if the device cannot be allocated, or
 * the error from register_netdev().
 */
static int __net_init sit_init_net(struct net *net)
{
	struct sit_net *sitn = net_generic(net, sit_net_id);
	struct ip_tunnel *t;
	int err;

	sitn->tunnels[0] = sitn->tunnels_wc;
	sitn->tunnels[1] = sitn->tunnels_l;
	sitn->tunnels[2] = sitn->tunnels_r;
	sitn->tunnels[3] = sitn->tunnels_r_l;

	if (!net_has_fallback_tunnels(net))
		return 0;

	sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0",
					   NET_NAME_UNKNOWN,
					   ipip6_tunnel_setup);
	if (!sitn->fb_tunnel_dev) {
		err = -ENOMEM;
		goto err_alloc_dev;
	}
	dev_net_set(sitn->fb_tunnel_dev, net);
	sitn->fb_tunnel_dev->rtnl_link_ops = &sit_link_ops;
	/* FB netdevice is special: we have one, and only one per netns.
	 * Allowing to move it to another netns is clearly unsafe.
	 */
	sitn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;

	err = register_netdev(sitn->fb_tunnel_dev);
	if (err)
		goto err_reg_dev;

	ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn);
	ipip6_fb_tunnel_init(sitn->fb_tunnel_dev);

	t = netdev_priv(sitn->fb_tunnel_dev);

	strcpy(t->parms.name, sitn->fb_tunnel_dev->name);
	return 0;

err_reg_dev:
	/*
	 * Do not call ipip6_dev_free() here.  It is the device's
	 * priv_destructor (set in ipip6_tunnel_setup()), which the core
	 * registration error path already invokes once ndo_init has
	 * succeeded, and ipip6_tunnel_init() cleans up after itself when
	 * it fails.  Calling it again would free tstats and the dst
	 * cache twice.
	 */
	free_netdev(sitn->fb_tunnel_dev);
err_alloc_dev:
	return err;
}

1886
/*
 * Batched per-netns exit: under the RTNL lock, collect the sit devices
 * of every exiting netns and unregister them all in one go.
 */
static void __net_exit sit_exit_batch_net(struct list_head *net_list)
{
	struct net *net;
	LIST_HEAD(kill_list);

	rtnl_lock();

	list_for_each_entry(net, net_list, exit_list) {
		sit_destroy_tunnels(net, &kill_list);
	}
	unregister_netdevice_many(&kill_list);

	rtnl_unlock();
}

static struct pernet_operations sit_net_ops = {
	.init = sit_init_net,
1901
	.exit_batch = sit_exit_batch_net,
1902 1903
	.id   = &sit_net_id,
	.size = sizeof(struct sit_net),
1904 1905
};

1906
/*
 * Module exit: undo sit_init() in reverse order (link ops, tunnel
 * handlers, pernet device), then wait for outstanding RCU callbacks.
 */
static void __exit sit_cleanup(void)
{
	rtnl_link_unregister(&sit_link_ops);

	xfrm4_tunnel_deregister(&sit_handler, AF_INET6);
	xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
#if IS_ENABLED(CONFIG_MPLS)
	xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS);
#endif

	unregister_pernet_device(&sit_net_ops);

	/* Wait for completion of call_rcu()'s */
	rcu_barrier();
}

1919
/*
 * Module init: register the pernet device, the xfrm4 tunnel handlers
 * (IPv6, IPv4 and, with CONFIG_MPLS, MPLS payloads) and the rtnl link
 * ops.  On any failure everything registered so far is unwound in
 * reverse order and the error is returned.
 */
static int __init sit_init(void)
{
	int err;

	pr_info("IPv6, IPv4 and MPLS over IPv4 tunneling driver\n");

	err = register_pernet_device(&sit_net_ops);
	if (err < 0)
		return err;

	err = xfrm4_tunnel_register(&sit_handler, AF_INET6);
	if (err < 0) {
		pr_info("%s: can't register ip6ip4\n", __func__);
		goto xfrm_tunnel_failed;
	}

	err = xfrm4_tunnel_register(&ipip_handler, AF_INET);
	if (err < 0) {
		pr_info("%s: can't register ip4ip4\n", __func__);
		goto xfrm_tunnel4_failed;
	}

#if IS_ENABLED(CONFIG_MPLS)
	err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS);
	if (err < 0) {
		pr_info("%s: can't register mplsip\n", __func__);
		goto xfrm_tunnel_mpls_failed;
	}
#endif

	err = rtnl_link_register(&sit_link_ops);
	if (err < 0)
		goto rtnl_link_failed;

	return err;

rtnl_link_failed:
#if IS_ENABLED(CONFIG_MPLS)
	xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS);
xfrm_tunnel_mpls_failed:
#endif
	xfrm4_tunnel_deregister(&ipip_handler, AF_INET);
xfrm_tunnel4_failed:
	xfrm4_tunnel_deregister(&sit_handler, AF_INET6);
xfrm_tunnel_failed:
	unregister_pernet_device(&sit_net_ops);
	return err;
}
1964 1965 1966

module_init(sit_init);
module_exit(sit_cleanup);
1967
MODULE_LICENSE("GPL");
1968
MODULE_ALIAS_RTNL_LINK("sit");
1969
MODULE_ALIAS_NETDEV("sit0");