geneve.c 38.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/*
 * GENEVE: Generic Network Virtualization Encapsulation
 *
 * Copyright (c) 2015 Red Hat, Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/etherdevice.h>
#include <linux/hash.h>
17
#include <net/dst_metadata.h>
18
#include <net/gro_cells.h>
19 20
#include <net/rtnetlink.h>
#include <net/geneve.h>
21
#include <net/protocol.h>
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36

#define GENEVE_NETDEV_VER	"0.6"

/* Default GENEVE destination UDP port */
#define GENEVE_UDP_PORT		6081

/* The VNI is a 24-bit field */
#define GENEVE_N_VID		(1u << 24)
#define GENEVE_VID_MASK		(GENEVE_N_VID - 1)

/* Size of the per-socket VNI -> device hash table */
#define VNI_HASH_BITS		10
#define VNI_HASH_SIZE		(1<<VNI_HASH_BITS)

/* Writable module parameter: rate-limited logging of bad-ECN packets */
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

/* Only GENEVE version 0 is implemented */
#define GENEVE_VER 0
/* Outer UDP header plus the fixed part of the GENEVE header */
#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr))

40 41
/* per-network namespace private data for this module */
struct geneve_net {
	struct list_head	geneve_list;	/* all geneve devices in this netns */
	struct list_head	sock_list;	/* all open geneve UDP sockets */
};

/* pernet id, used with net_generic() to reach struct geneve_net */
static unsigned int geneve_net_id;
47

48 49 50 51 52
/* Pseudo network device */
struct geneve_dev {
	struct hlist_node  hlist;	/* vni hash table */
	struct net	   *net;	/* netns for packet i/o */
	struct net_device  *dev;	/* netdev for geneve tunnel */
	/* Tunnel parameters (tun_id/VNI, remote address, dst port, tos,
	 * ttl, label, flags) plus the per-device dst_cache.
	 */
	struct ip_tunnel_info info;
	struct geneve_sock __rcu *sock4;	/* IPv4 socket used for geneve tunnel */
#if IS_ENABLED(CONFIG_IPV6)
	struct geneve_sock __rcu *sock6;	/* IPv6 socket used for geneve tunnel */
#endif
	struct list_head   next;	/* geneve's per namespace list */
	struct gro_cells   gro_cells;	/* rx path hands skbs to GRO here */
	bool		   collect_md;	/* external control (metadata) mode */
	bool		   use_udp6_rx_checksums; /* fed to udp_conf on socket create */
};

64 65 66 67 68 69
/* One open GENEVE UDP socket; shared by all devices bound to its port */
struct geneve_sock {
	bool			collect_md;	/* socket receives in metadata mode */
	struct list_head	list;		/* entry in geneve_net::sock_list */
	struct socket		*sock;		/* underlying kernel UDP socket */
	struct rcu_head		rcu;		/* deferred free via kfree_rcu() */
	int			refcnt;		/* number of devices using this socket */
	struct hlist_head	vni_list[VNI_HASH_SIZE];	/* devices hashed by VNI */
};
72 73 74 75 76 77 78 79 80

/* Fold a 24-bit VNI (3 network-order bytes) into a vni_list bucket index. */
static inline __u32 geneve_net_vni_hash(u8 vni[3])
{
	__u32 vnid = (vni[0] << 16) | (vni[1] << 8) | vni[2];

	return hash_32(vnid, VNI_HASH_BITS);
}

81 82 83 84 85 86 87 88 89 90 91
/* Convert 24 bit VNI to 64 bit tunnel ID. */
static __be64 vni_to_tunnel_id(const __u8 *vni)
{
#ifdef __BIG_ENDIAN
	/* big endian: the VNI bytes land directly in the low 24 bits */
	return (vni[0] << 16) | (vni[1] << 8) | vni[2];
#else
	/* little endian: place the three VNI bytes so that the in-memory
	 * layout of the __be64 matches the big-endian case above
	 */
	return (__force __be64)(((__force u64)vni[0] << 40) |
				((__force u64)vni[1] << 48) |
				((__force u64)vni[2] << 56));
#endif
}

92 93 94 95 96 97 98 99 100 101 102 103 104 105
/* Convert 64 bit tunnel ID to 24 bit VNI. */
static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
{
#ifdef __BIG_ENDIAN
	/* big endian: VNI occupies the low 24 bits of the tunnel id */
	vni[0] = (__force __u8)(tun_id >> 16);
	vni[1] = (__force __u8)(tun_id >> 8);
	vni[2] = (__force __u8)tun_id;
#else
	/* little endian: inverse of vni_to_tunnel_id()'s byte placement */
	vni[0] = (__force __u8)((__force u64)tun_id >> 40);
	vni[1] = (__force __u8)((__force u64)tun_id >> 48);
	vni[2] = (__force __u8)((__force u64)tun_id >> 56);
#endif
}

106 107 108 109 110
/* Address family (AF_INET/AF_INET6) of the socket backing @gs. */
static sa_family_t geneve_get_sk_family(struct geneve_sock *gs)
{
	return gs->sock->sk->sk_family;
}

111
/* Find the geneve device on @gs matching @vni and outer IPv4 remote @addr.
 * Walks the socket's VNI hash chain under RCU; returns NULL if no match.
 */
static struct geneve_dev *geneve_lookup(struct geneve_sock *gs,
					__be32 addr, u8 vni[])
{
	__be64 id = vni_to_tunnel_id(vni);
	struct hlist_head *vni_list_head;
	struct geneve_dev *geneve;
	__u32 hash;

	/* Find the device for this VNI */
	hash = geneve_net_vni_hash(vni);
	vni_list_head = &gs->vni_list[hash];
	hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) {
		/* match both tunnel id and configured remote address */
		if (!memcmp(&id, &geneve->info.key.tun_id, sizeof(id)) &&
		    addr == geneve->info.key.u.ipv4.dst)
			return geneve;
	}
	return NULL;
}

#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 counterpart of geneve_lookup(): match @vni and outer remote @addr6. */
static struct geneve_dev *geneve6_lookup(struct geneve_sock *gs,
					 struct in6_addr addr6, u8 vni[])
{
	__be64 id = vni_to_tunnel_id(vni);
	struct hlist_head *vni_list_head;
	struct geneve_dev *geneve;
	__u32 hash;

	/* Find the device for this VNI */
	hash = geneve_net_vni_hash(vni);
	vni_list_head = &gs->vni_list[hash];
	hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) {
		/* match both tunnel id and configured remote address */
		if (!memcmp(&id, &geneve->info.key.tun_id, sizeof(id)) &&
		    ipv6_addr_equal(&addr6, &geneve->info.key.u.ipv6.dst))
			return geneve;
	}
	return NULL;
}
#endif
150

151 152 153 154 155
/* The GENEVE header immediately follows the outer UDP header. */
static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
{
	return (struct genevehdr *)(udp_hdr(skb) + 1);
}

156 157
/* Resolve the geneve device a received packet belongs to.  In collect_md
 * mode the socket has a single wildcard device (zero VNI, zero address);
 * otherwise match on the packet's VNI and outer source address.
 */
static struct geneve_dev *geneve_lookup_skb(struct geneve_sock *gs,
					    struct sk_buff *skb)
{
	static u8 zero_vni[3];
	u8 *vni;

	if (geneve_get_sk_family(gs) == AF_INET) {
		struct iphdr *iph;
		__be32 addr;

		iph = ip_hdr(skb); /* outer IP header... */

		if (gs->collect_md) {
			vni = zero_vni;
			addr = 0;
		} else {
			vni = geneve_hdr(skb)->vni;
			addr = iph->saddr;
		}

		return geneve_lookup(gs, addr, vni);
#if IS_ENABLED(CONFIG_IPV6)
	} else if (geneve_get_sk_family(gs) == AF_INET6) {
		static struct in6_addr zero_addr6;
		struct ipv6hdr *ip6h;
		struct in6_addr addr6;

		ip6h = ipv6_hdr(skb); /* outer IPv6 header... */

		if (gs->collect_md) {
			vni = zero_vni;
			addr6 = zero_addr6;
		} else {
			vni = geneve_hdr(skb)->vni;
			addr6 = ip6h->saddr;
		}

		return geneve6_lookup(gs, addr6, vni);
#endif
	}
	/* unexpected socket family */
	return NULL;
}

/* geneve receive/decap routine */
static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs,
		      struct sk_buff *skb)
{
	struct genevehdr *gnvh = geneve_hdr(skb);
	struct metadata_dst *tun_dst = NULL;
	struct pcpu_sw_netstats *stats;
	int err = 0;
	void *oiph;

	if (ip_tunnel_collect_metadata() || gs->collect_md) {
		__be16 flags;

		/* Build a metadata dst carrying the tunnel key, VNI and
		 * options so upper layers (e.g. OVS) can see them.
		 */
		flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT |
			(gnvh->oam ? TUNNEL_OAM : 0) |
			(gnvh->critical ? TUNNEL_CRIT_OPT : 0);

		tun_dst = udp_tun_rx_dst(skb, geneve_get_sk_family(gs), flags,
					 vni_to_tunnel_id(gnvh->vni),
					 gnvh->opt_len * 4);
		if (!tun_dst)
			goto drop;
		/* Update tunnel dst according to Geneve options. */
		ip_tunnel_info_opts_set(&tun_dst->u.tun_info,
					gnvh->options, gnvh->opt_len * 4);
	} else {
		/* Drop packets w/ critical options,
		 * since we don't support any...
		 */
		if (gnvh->critical)
			goto drop;
	}

	skb_reset_mac_header(skb);
	skb->protocol = eth_type_trans(skb, geneve->dev);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	if (tun_dst)
		skb_dst_set(skb, &tun_dst->dst);

	/* Ignore packet loops (and multicast echo) */
	if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr))
		goto drop;

	/* Remember the outer IP header before the headers are reset so the
	 * ECN state can still be inspected below.
	 */
	oiph = skb_network_header(skb);
	skb_reset_network_header(skb);

	if (geneve_get_sk_family(gs) == AF_INET)
		err = IP_ECN_decapsulate(oiph, skb);
#if IS_ENABLED(CONFIG_IPV6)
	else
		err = IP6_ECN_decapsulate(oiph, skb);
#endif

	if (unlikely(err)) {
		if (log_ecn_error) {
			if (geneve_get_sk_family(gs) == AF_INET)
				net_info_ratelimited("non-ECT from %pI4 "
						     "with TOS=%#x\n",
						     &((struct iphdr *)oiph)->saddr,
						     ((struct iphdr *)oiph)->tos);
#if IS_ENABLED(CONFIG_IPV6)
			else
				net_info_ratelimited("non-ECT from %pI6\n",
						     &((struct ipv6hdr *)oiph)->saddr);
#endif
		}
		/* err > 1 means the packet must be dropped, not just logged */
		if (err > 1) {
			++geneve->dev->stats.rx_frame_errors;
			++geneve->dev->stats.rx_errors;
			goto drop;
		}
	}

	stats = this_cpu_ptr(geneve->dev->tstats);
	u64_stats_update_begin(&stats->syncp);
	stats->rx_packets++;
	stats->rx_bytes += skb->len;
	u64_stats_update_end(&stats->syncp);

	/* Hand the inner packet to GRO for aggregation/delivery */
	gro_cells_receive(&geneve->gro_cells, skb);
	return;
drop:
	/* Consume bad packet */
	kfree_skb(skb);
}

/* Setup stats when device is created (ndo_init).  Allocates per-cpu
 * stats, the GRO cells and the per-device dst cache; unwinds all
 * earlier allocations on failure.
 */
static int geneve_init(struct net_device *dev)
{
	struct geneve_dev *geneve = netdev_priv(dev);
	int err;

	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;

	err = gro_cells_init(&geneve->gro_cells, dev);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

	err = dst_cache_init(&geneve->info.dst_cache, GFP_KERNEL);
	if (err) {
		/* undo both earlier allocations */
		free_percpu(dev->tstats);
		gro_cells_destroy(&geneve->gro_cells);
		return err;
	}
	return 0;
}

/* ndo_uninit: release everything geneve_init() allocated. */
static void geneve_uninit(struct net_device *dev)
{
	struct geneve_dev *geneve = netdev_priv(dev);

	dst_cache_destroy(&geneve->info.dst_cache);
	gro_cells_destroy(&geneve->gro_cells);
	free_percpu(dev->tstats);
}

320 321 322 323
/* Callback from net/ipv4/udp.c to receive packets */
static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
	struct genevehdr *geneveh;
	struct geneve_dev *geneve;
	struct geneve_sock *gs;
	int opts_len;

	/* Need Geneve and inner Ethernet header to be present */
	if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
		goto drop;

	/* Return packets with reserved bits set */
	geneveh = geneve_hdr(skb);
	if (unlikely(geneveh->ver != GENEVE_VER))
		goto drop;

	/* only Ethernet payloads (transparent ethernet bridging) supported */
	if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
		goto drop;

	gs = rcu_dereference_sk_user_data(sk);
	if (!gs)
		goto drop;

	geneve = geneve_lookup_skb(gs, skb);
	if (!geneve)
		goto drop;

	/* strip outer headers (UDP + GENEVE + options) before decap */
	opts_len = geneveh->opt_len * 4;
	if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
				 htons(ETH_P_TEB),
				 !net_eq(geneve->net, dev_net(geneve->dev))))
		goto drop;

	geneve_rx(geneve, gs, skb);
	return 0;

drop:
	/* Consume bad packet */
	kfree_skb(skb);
	return 0;
}

/* Open a kernel UDP socket bound to @port in @net for tunnel traffic.
 * Returns the socket or an ERR_PTR from udp_sock_create().
 */
static struct socket *geneve_create_sock(struct net *net, bool ipv6,
					 __be16 port, bool ipv6_rx_csum)
{
	struct socket *sock;
	struct udp_port_cfg udp_conf;
	int err;

	memset(&udp_conf, 0, sizeof(udp_conf));

	if (ipv6) {
		udp_conf.family = AF_INET6;
		/* v6-only: the v4 case below opens its own socket */
		udp_conf.ipv6_v6only = 1;
		udp_conf.use_udp6_rx_checksums = ipv6_rx_csum;
	} else {
		udp_conf.family = AF_INET;
		udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
	}

	udp_conf.local_udp_port = port;

	/* Open UDP socket */
	err = udp_sock_create(net, &udp_conf, &sock);
	if (err < 0)
		return ERR_PTR(err);

	return sock;
}

/* Total GENEVE header length: fixed header plus variable options. */
static int geneve_hlen(struct genevehdr *gnvh)
{
	int opts_len = gnvh->opt_len * 4;

	return sizeof(struct genevehdr) + opts_len;
}

396 397 398
/* GRO receive hook for the GENEVE UDP socket: aggregate packets whose
 * GENEVE headers (including options) are identical, then hand the inner
 * protocol to its own GRO handler.
 */
static struct sk_buff **geneve_gro_receive(struct sock *sk,
					   struct sk_buff **head,
					   struct sk_buff *skb)
{
	struct sk_buff *p, **pp = NULL;
	struct genevehdr *gh, *gh2;
	unsigned int hlen, gh_len, off_gnv;
	const struct packet_offload *ptype;
	__be16 type;
	int flush = 1;

	/* make the fixed GENEVE header linear in the GRO header area */
	off_gnv = skb_gro_offset(skb);
	hlen = off_gnv + sizeof(*gh);
	gh = skb_gro_header_fast(skb, off_gnv);
	if (skb_gro_header_hard(skb, hlen)) {
		gh = skb_gro_header_slow(skb, hlen, off_gnv);
		if (unlikely(!gh))
			goto out;
	}

	/* don't aggregate unknown versions or OAM frames */
	if (gh->ver != GENEVE_VER || gh->oam)
		goto out;
	gh_len = geneve_hlen(gh);

	/* re-pull now that the variable-length options size is known */
	hlen = off_gnv + gh_len;
	if (skb_gro_header_hard(skb, hlen)) {
		gh = skb_gro_header_slow(skb, hlen, off_gnv);
		if (unlikely(!gh))
			goto out;
	}

	/* packets on the same flow must carry identical GENEVE headers */
	for (p = *head; p; p = p->next) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		gh2 = (struct genevehdr *)(p->data + off_gnv);
		if (gh->opt_len != gh2->opt_len ||
		    memcmp(gh, gh2, gh_len)) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}
	}

	type = gh->proto_type;

	rcu_read_lock();
	ptype = gro_find_receive_by_type(type);
	if (!ptype)
		goto out_unlock;

	skb_gro_pull(skb, gh_len);
	skb_gro_postpull_rcsum(skb, gh, gh_len);
	pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
	flush = 0;

out_unlock:
	rcu_read_unlock();
out:
	NAPI_GRO_CB(skb)->flush |= flush;

	return pp;
}

459 460
/* GRO complete hook: finish aggregation by delegating to the inner
 * protocol's gro_complete and recording the inner MAC header offset.
 */
static int geneve_gro_complete(struct sock *sk, struct sk_buff *skb,
			       int nhoff)
{
	struct genevehdr *gh;
	struct packet_offload *ptype;
	__be16 type;
	int gh_len;
	int err = -ENOSYS;

	gh = (struct genevehdr *)(skb->data + nhoff);
	gh_len = geneve_hlen(gh);
	type = gh->proto_type;

	rcu_read_lock();
	ptype = gro_find_complete_by_type(type);
	if (ptype)
		err = ptype->callbacks.gro_complete(skb, nhoff + gh_len);

	rcu_read_unlock();

	skb_set_inner_mac_header(skb, nhoff + gh_len);

	return err;
}

/* Create new listen socket if needed */
static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
						bool ipv6, bool ipv6_rx_csum)
{
	struct geneve_net *gn = net_generic(net, geneve_net_id);
	struct geneve_sock *gs;
	struct socket *sock;
	struct udp_tunnel_sock_cfg tunnel_cfg;
	int h;

	gs = kzalloc(sizeof(*gs), GFP_KERNEL);
	if (!gs)
		return ERR_PTR(-ENOMEM);

	sock = geneve_create_sock(net, ipv6, port, ipv6_rx_csum);
	if (IS_ERR(sock)) {
		kfree(gs);
		return ERR_CAST(sock);
	}

	gs->sock = sock;
	gs->refcnt = 1;
	for (h = 0; h < VNI_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&gs->vni_list[h]);

	/* Initialize the geneve udp offloads structure */
	udp_tunnel_notify_add_rx_port(gs->sock, UDP_TUNNEL_TYPE_GENEVE);

	/* Mark socket as an encapsulation socket */
	memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
	tunnel_cfg.sk_user_data = gs;
	tunnel_cfg.encap_type = 1;
	tunnel_cfg.gro_receive = geneve_gro_receive;
	tunnel_cfg.gro_complete = geneve_gro_complete;
	tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
	tunnel_cfg.encap_destroy = NULL;
	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
	list_add(&gs->list, &gn->sock_list);
	return gs;
}

525
/* Drop one reference on @gs; on the last reference unlink it, tell the
 * offload layer the port is gone, close the socket and free via RCU.
 */
static void __geneve_sock_release(struct geneve_sock *gs)
{
	if (!gs || --gs->refcnt)
		return;

	list_del(&gs->list);
	udp_tunnel_notify_del_rx_port(gs->sock, UDP_TUNNEL_TYPE_GENEVE);
	udp_tunnel_sock_release(gs->sock);
	kfree_rcu(gs, rcu);
}

536 537
/* Detach @geneve from its v4/v6 sockets.  The RCU pointers are cleared
 * first and synchronize_net() waits out in-flight readers before the
 * sockets themselves may be released.
 */
static void geneve_sock_release(struct geneve_dev *geneve)
{
	struct geneve_sock *gs4 = rtnl_dereference(geneve->sock4);
#if IS_ENABLED(CONFIG_IPV6)
	struct geneve_sock *gs6 = rtnl_dereference(geneve->sock6);

	rcu_assign_pointer(geneve->sock6, NULL);
#endif

	rcu_assign_pointer(geneve->sock4, NULL);
	synchronize_net();

	__geneve_sock_release(gs4);
#if IS_ENABLED(CONFIG_IPV6)
	__geneve_sock_release(gs6);
#endif
}

554
static struct geneve_sock *geneve_find_sock(struct geneve_net *gn,
555
					    sa_family_t family,
556 557 558 559 560 561
					    __be16 dst_port)
{
	struct geneve_sock *gs;

	list_for_each_entry(gs, &gn->sock_list, list) {
		if (inet_sk(gs->sock->sk)->inet_sport == dst_port &&
562
		    geneve_get_sk_family(gs) == family) {
563 564 565 566 567 568
			return gs;
		}
	}
	return NULL;
}

569
/* Attach @geneve to the shared UDP socket for its destination port,
 * creating the socket if this is the first user, and hash the device
 * into the socket's VNI table.
 */
static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6)
{
	struct net *net = geneve->net;
	struct geneve_net *gn = net_generic(net, geneve_net_id);
	struct geneve_sock *gs;
	__u8 vni[3];
	__u32 hash;

	/* reuse an existing socket on the same port/family if possible */
	gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, geneve->info.key.tp_dst);
	if (gs) {
		gs->refcnt++;
		goto out;
	}

	gs = geneve_socket_create(net, geneve->info.key.tp_dst, ipv6,
				  geneve->use_udp6_rx_checksums);
	if (IS_ERR(gs))
		return PTR_ERR(gs);

out:
	gs->collect_md = geneve->collect_md;
#if IS_ENABLED(CONFIG_IPV6)
	if (ipv6)
		rcu_assign_pointer(geneve->sock6, gs);
	else
#endif
		rcu_assign_pointer(geneve->sock4, gs);

	tunnel_id_to_vni(geneve->info.key.tun_id, vni);
	hash = geneve_net_vni_hash(vni);
	hlist_add_head_rcu(&geneve->hlist, &gs->vni_list[hash]);
	return 0;
}

603 604 605
/* ndo_open: bind the device to its UDP socket(s).  Metadata mode needs
 * both families; a configured tunnel only needs its own family.
 */
static int geneve_open(struct net_device *dev)
{
	struct geneve_dev *geneve = netdev_priv(dev);
	bool ipv6 = !!(geneve->info.mode & IP_TUNNEL_INFO_IPV6);
	bool metadata = geneve->collect_md;
	int ret = 0;

#if IS_ENABLED(CONFIG_IPV6)
	if (ipv6 || metadata)
		ret = geneve_sock_add(geneve, true);
#endif
	if (!ret && (!ipv6 || metadata))
		ret = geneve_sock_add(geneve, false);
	if (ret < 0)
		/* roll back any socket that was attached before the failure */
		geneve_sock_release(geneve);

	return ret;
}

622 623 624 625
/* ndo_stop: unhash the device from the VNI table and drop its sockets. */
static int geneve_stop(struct net_device *dev)
{
	struct geneve_dev *geneve = netdev_priv(dev);

	if (!hlist_unhashed(&geneve->hlist))
		hlist_del_rcu(&geneve->hlist);
	geneve_sock_release(geneve);
	return 0;
}

632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647
/* Fill in an on-wire GENEVE header at @geneveh from the tunnel key
 * flags, VNI and option bytes.  @options_len is in bytes and must be a
 * multiple of 4 (opt_len is encoded in 4-byte words).
 */
static void geneve_build_header(struct genevehdr *geneveh,
				__be16 tun_flags, u8 vni[3],
				u8 options_len, u8 *options)
{
	geneveh->ver = GENEVE_VER;
	geneveh->opt_len = options_len / 4;
	geneveh->oam = !!(tun_flags & TUNNEL_OAM);
	geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT);
	geneveh->rsvd1 = 0;
	memcpy(geneveh->vni, vni, 3);
	/* inner payload is always an Ethernet frame */
	geneveh->proto_type = htons(ETH_P_TEB);
	geneveh->rsvd2 = 0;

	memcpy(geneveh->options, options, options_len);
}

648 649
/* Prepare @skb for IPv4 encapsulation: ensure headroom, apply checksum
 * offload handling and push the GENEVE header.  On error the route @rt
 * is released; on success the caller still owns it.
 */
static int geneve_build_skb(struct rtable *rt, struct sk_buff *skb,
			    __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
			    bool xnet)
{
	bool udp_sum = !!(tun_flags & TUNNEL_CSUM);
	struct genevehdr *gnvh;
	int min_headroom;
	int err;

	/* clear state when crossing netns boundaries */
	skb_scrub_packet(skb, xnet);

	min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
			+ GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr);
	err = skb_cow_head(skb, min_headroom);
	if (unlikely(err))
		goto free_rt;

	err = udp_tunnel_handle_offloads(skb, udp_sum);
	if (err)
		goto free_rt;

	gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
	geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);

	skb_set_inner_protocol(skb, htons(ETH_P_TEB));
	return 0;

free_rt:
	ip_rt_put(rt);
	return err;
}

680 681 682
#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 counterpart of geneve_build_skb(); releases @dst on error. */
static int geneve6_build_skb(struct dst_entry *dst, struct sk_buff *skb,
			     __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt,
			     bool xnet)
{
	bool udp_sum = !!(tun_flags & TUNNEL_CSUM);
	struct genevehdr *gnvh;
	int min_headroom;
	int err;

	/* clear state when crossing netns boundaries */
	skb_scrub_packet(skb, xnet);

	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
			+ GENEVE_BASE_HLEN + opt_len + sizeof(struct ipv6hdr);
	err = skb_cow_head(skb, min_headroom);
	if (unlikely(err))
		goto free_dst;

	err = udp_tunnel_handle_offloads(skb, udp_sum);
	if (err)
		goto free_dst;

	gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
	geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);

	skb_set_inner_protocol(skb, htons(ETH_P_TEB));
	return 0;

free_dst:
	dst_release(dst);
	return err;
}
#endif

/* Route lookup for IPv4 transmit.  Fills @fl4 from the tunnel key and
 * returns a route, consulting/updating the dst cache when usable.
 * Returns ERR_PTR(-EIO) if no v4 socket, -ENETUNREACH if no route, or
 * -ELOOP if the route would go back out through this device.
 */
static struct rtable *geneve_get_v4_rt(struct sk_buff *skb,
				       struct net_device *dev,
				       struct flowi4 *fl4,
				       struct ip_tunnel_info *info)
{
	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
	struct geneve_dev *geneve = netdev_priv(dev);
	struct dst_cache *dst_cache;
	struct rtable *rt = NULL;
	__u8 tos;

	if (!rcu_dereference(geneve->sock4))
		return ERR_PTR(-EIO);

	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_mark = skb->mark;
	fl4->flowi4_proto = IPPROTO_UDP;
	fl4->daddr = info->key.u.ipv4.dst;
	fl4->saddr = info->key.u.ipv4.src;

	tos = info->key.tos;
	/* tos == 1 means "inherit from inner packet" for configured
	 * tunnels; the per-packet tos makes the cached route unusable
	 */
	if ((tos == 1) && !geneve->collect_md) {
		tos = ip_tunnel_get_dsfield(ip_hdr(skb), skb);
		use_cache = false;
	}
	fl4->flowi4_tos = RT_TOS(tos);

	dst_cache = &info->dst_cache;
	if (use_cache) {
		rt = dst_cache_get_ip4(dst_cache, &fl4->saddr);
		if (rt)
			return rt;
	}
	rt = ip_route_output_key(geneve->net, fl4);
	if (IS_ERR(rt)) {
		netdev_dbg(dev, "no route to %pI4\n", &fl4->daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	if (rt->dst.dev == dev) { /* is this necessary? */
		netdev_dbg(dev, "circular route to %pI4\n", &fl4->daddr);
		ip_rt_put(rt);
		return ERR_PTR(-ELOOP);
	}
	if (use_cache)
		dst_cache_set_ip4(dst_cache, &rt->dst, fl4->saddr);
	return rt;
}

762 763 764 765 766 767
#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 counterpart of geneve_get_v4_rt(): fill @fl6 and look up a dst,
 * using the dst cache when usable.  Same error conventions.
 */
static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
					   struct net_device *dev,
					   struct flowi6 *fl6,
					   struct ip_tunnel_info *info)
{
	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
	struct geneve_dev *geneve = netdev_priv(dev);
	struct dst_entry *dst = NULL;
	struct dst_cache *dst_cache;
	struct geneve_sock *gs6;
	__u8 prio;

	gs6 = rcu_dereference(geneve->sock6);
	if (!gs6)
		return ERR_PTR(-EIO);

	memset(fl6, 0, sizeof(*fl6));
	fl6->flowi6_mark = skb->mark;
	fl6->flowi6_proto = IPPROTO_UDP;
	fl6->daddr = info->key.u.ipv6.dst;
	fl6->saddr = info->key.u.ipv6.src;
	prio = info->key.tos;
	/* prio == 1 means "inherit from inner packet" for configured
	 * tunnels; per-packet prio makes the cached dst unusable
	 */
	if ((prio == 1) && !geneve->collect_md) {
		prio = ip_tunnel_get_dsfield(ip_hdr(skb), skb);
		use_cache = false;
	}

	fl6->flowlabel = ip6_make_flowinfo(RT_TOS(prio),
					   info->key.label);
	dst_cache = &info->dst_cache;
	if (use_cache) {
		dst = dst_cache_get_ip6(dst_cache, &fl6->saddr);
		if (dst)
			return dst;
	}
	if (ipv6_stub->ipv6_dst_lookup(geneve->net, gs6->sock->sk, &dst, fl6)) {
		netdev_dbg(dev, "no route to %pI6\n", &fl6->daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	if (dst->dev == dev) { /* is this necessary? */
		netdev_dbg(dev, "circular route to %pI6\n", &fl6->daddr);
		dst_release(dst);
		return ERR_PTR(-ELOOP);
	}

	if (use_cache)
		dst_cache_set_ip6(dst_cache, dst, &fl6->saddr);
	return dst;
}
#endif

814 815
/* Encapsulate and transmit one skb over IPv4: route lookup, tos/ttl
 * selection (per-key in metadata mode, configured/inherited otherwise),
 * GENEVE header build, then UDP tunnel transmit.
 */
static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
			   struct geneve_dev *geneve, struct ip_tunnel_info *info)
{
	bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
	struct geneve_sock *gs4 = rcu_dereference(geneve->sock4);
	const struct ip_tunnel_key *key = &info->key;
	struct rtable *rt;
	int err = -EINVAL;
	struct flowi4 fl4;
	u8 *opts = NULL;
	__u8 tos, ttl;
	__be16 sport;
	__be16 df;
	u8 vni[3];

	if (!gs4)
		return err;

	rt = geneve_get_v4_rt(skb, dev, &fl4, info);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	/* hash the inner flow into the source port for ECMP spreading */
	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
	if (geneve->collect_md) {
		tos = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb);
		ttl = key->ttl;
	} else {
		tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, ip_hdr(skb), skb);
		ttl = key->ttl ? : ip4_dst_hoplimit(&rt->dst);
	}
	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;

	tunnel_id_to_vni(key->tun_id, vni);
	if (info->options_len)
		opts = ip_tunnel_info_opts(info);

	skb_reset_mac_header(skb);
	err = geneve_build_skb(rt, skb, key->tun_flags, vni,
			       info->options_len, opts, xnet);
	if (unlikely(err))
		return err;

	udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, fl4.saddr, fl4.daddr,
			    tos, ttl, df, sport, geneve->info.key.tp_dst,
			    !net_eq(geneve->net, dev_net(geneve->dev)),
			    !(info->key.tun_flags & TUNNEL_CSUM));
	return 0;
}

863
#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 counterpart of geneve_xmit_skb(). */
static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
			    struct geneve_dev *geneve, struct ip_tunnel_info *info)
{
	bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
	struct geneve_sock *gs6 = rcu_dereference(geneve->sock6);
	const struct ip_tunnel_key *key = &info->key;
	struct dst_entry *dst = NULL;
	int err = -EINVAL;
	struct flowi6 fl6;
	u8 *opts = NULL;
	__u8 prio, ttl;
	__be16 sport;
	u8 vni[3];

	if (!gs6)
		return err;

	dst = geneve_get_v6_dst(skb, dev, &fl6, info);
	if (IS_ERR(dst))
		return PTR_ERR(dst);

	/* hash the inner flow into the source port for ECMP spreading */
	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
	if (geneve->collect_md) {
		prio = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb);
		ttl = key->ttl;
	} else {
		prio = ip_tunnel_ecn_encap(ip6_tclass(fl6.flowlabel),
					   ip_hdr(skb), skb);
		ttl = key->ttl ? : ip6_dst_hoplimit(dst);
	}
	tunnel_id_to_vni(key->tun_id, vni);
	if (info->options_len)
		opts = ip_tunnel_info_opts(info);

	skb_reset_mac_header(skb);
	err = geneve6_build_skb(dst, skb, key->tun_flags, vni,
				info->options_len, opts, xnet);
	if (unlikely(err))
		return err;

	udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev,
			     &fl6.saddr, &fl6.daddr, prio, ttl,
			     info->key.label, sport, geneve->info.key.tp_dst,
			     !(info->key.tun_flags & TUNNEL_CSUM));
	return 0;
}
#endif
911

912 913 914 915 916
/* ndo_start_xmit: pick tunnel parameters (skb metadata in collect_md
 * mode, device config otherwise) and dispatch to the v4 or v6 path.
 * Always returns NETDEV_TX_OK; failures are accounted in dev->stats.
 */
static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct geneve_dev *geneve = netdev_priv(dev);
	struct ip_tunnel_info *info = NULL;
	int err;

	if (geneve->collect_md) {
		info = skb_tunnel_info(skb);
		if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) {
			err = -EINVAL;
			netdev_dbg(dev, "no tunnel metadata\n");
			goto tx_error;
		}
	} else {
		info = &geneve->info;
	}

#if IS_ENABLED(CONFIG_IPV6)
	if (info->mode & IP_TUNNEL_INFO_IPV6)
		err = geneve6_xmit_skb(skb, dev, geneve, info);
	else
#endif
		err = geneve_xmit_skb(skb, dev, geneve, info);

	if (likely(!err))
		return NETDEV_TX_OK;
tx_error:
	dev_kfree_skb(skb);

	/* map the specific failure onto the matching error counter */
	if (err == -ELOOP)
		dev->stats.collisions++;
	else if (err == -ENETUNREACH)
		dev->stats.tx_carrier_errors++;

	dev->stats.tx_errors++;
	return NETDEV_TX_OK;
}

950
static int geneve_change_mtu(struct net_device *dev, int new_mtu)
D
David Wragg 已提交
951
{
952 953
	/* Only possible if called internally, ndo_change_mtu path's new_mtu
	 * is guaranteed to be between dev->min_mtu and dev->max_mtu.
D
David Wragg 已提交
954
	 */
955 956
	if (new_mtu > dev->max_mtu)
		new_mtu = dev->max_mtu;
D
David Wragg 已提交
957

D
David Wragg 已提交
958 959 960 961
	dev->mtu = new_mtu;
	return 0;
}

962 963 964 965 966
/* ndo_fill_metadata_dst: pre-compute the egress tunnel parameters
 * (source address and UDP ports) for a packet's metadata dst so that
 * upper layers can read them before the packet is actually sent.
 */
static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	struct geneve_dev *geneve = netdev_priv(dev);

	if (ip_tunnel_info_af(info) == AF_INET) {
		struct rtable *rt;
		struct flowi4 fl4;

		rt = geneve_get_v4_rt(skb, dev, &fl4, info);
		if (IS_ERR(rt))
			return PTR_ERR(rt);

		/* only the selected source address is needed, not the route */
		ip_rt_put(rt);
		info->key.u.ipv4.src = fl4.saddr;
#if IS_ENABLED(CONFIG_IPV6)
	} else if (ip_tunnel_info_af(info) == AF_INET6) {
		struct dst_entry *dst;
		struct flowi6 fl6;

		dst = geneve_get_v6_dst(skb, dev, &fl6, info);
		if (IS_ERR(dst))
			return PTR_ERR(dst);

		dst_release(dst);
		info->key.u.ipv6.src = fl6.saddr;
#endif
	} else {
		return -EINVAL;
	}

	/* same source-port hashing as the transmit paths */
	info->key.tp_src = udp_flow_src_port(geneve->net, skb,
					     1, USHRT_MAX, true);
	info->key.tp_dst = geneve->info.key.tp_dst;
	return 0;
}

999 1000 1001 1002 1003 1004 1005
/* netdev callbacks for the geneve pseudo device */
static const struct net_device_ops geneve_netdev_ops = {
	.ndo_init		= geneve_init,
	.ndo_uninit		= geneve_uninit,
	.ndo_open		= geneve_open,
	.ndo_stop		= geneve_stop,
	.ndo_start_xmit		= geneve_xmit,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_change_mtu		= geneve_change_mtu,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_fill_metadata_dst	= geneve_fill_metadata_dst,
};

/* ethtool -i: report driver name and module version. */
static void geneve_get_drvinfo(struct net_device *dev,
			       struct ethtool_drvinfo *drvinfo)
{
	strlcpy(drvinfo->driver, "geneve", sizeof(drvinfo->driver));
	strlcpy(drvinfo->version, GENEVE_NETDEV_VER, sizeof(drvinfo->version));
}

/* ethtool callbacks */
static const struct ethtool_ops geneve_ethtool_ops = {
	.get_drvinfo	= geneve_get_drvinfo,
	.get_link	= ethtool_op_get_link,
};

/* Info for udev, that this is a virtual tunnel endpoint */
static struct device_type geneve_type = {
	.name = "geneve",
};

1029
/* Calls the ndo_udp_tunnel_add of the caller in order to
 * supply the listening GENEVE udp ports. Callers are expected
 * to implement the ndo_udp_tunnel_add.
 */
static void geneve_push_rx_ports(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct geneve_net *gn = net_generic(net, geneve_net_id);
	struct geneve_sock *gs;

	/* sock_list is walked under RCU; advertise every open port */
	rcu_read_lock();
	list_for_each_entry_rcu(gs, &gn->sock_list, list)
		udp_tunnel_push_rx_port(dev, gs->sock,
					UDP_TUNNEL_TYPE_GENEVE);
	rcu_read_unlock();
}

1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064
/* Initialize the device structure. */
static void geneve_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->netdev_ops = &geneve_netdev_ops;
	dev->ethtool_ops = &geneve_ethtool_ops;
	dev->destructor = free_netdev;

	SET_NETDEV_DEVTYPE(dev, &geneve_type);

	dev->features    |= NETIF_F_LLTX;
	dev->features    |= NETIF_F_SG | NETIF_F_HW_CSUM;
	dev->features    |= NETIF_F_RXCSUM;
	dev->features    |= NETIF_F_GSO_SOFTWARE;

	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
	dev->hw_features |= NETIF_F_GSO_SOFTWARE;

	/* MTU range: 68 - (something less than 65535) */
	dev->min_mtu = ETH_MIN_MTU;
	/* The max_mtu calculation does not take account of GENEVE
	 * options, to avoid excluding potentially valid
	 * configurations. This will be further reduced by IPvX hdr size.
	 */
	dev->max_mtu = IP_MAX_MTU - GENEVE_BASE_HLEN - dev->hard_header_len;

	netif_keep_dst(dev);
	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
	eth_hw_addr_random(dev);
}

static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
	[IFLA_GENEVE_ID]		= { .type = NLA_U32 },
	[IFLA_GENEVE_REMOTE]		= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1082
	[IFLA_GENEVE_REMOTE6]		= { .len = sizeof(struct in6_addr) },
1083
	[IFLA_GENEVE_TTL]		= { .type = NLA_U8 },
1084
	[IFLA_GENEVE_TOS]		= { .type = NLA_U8 },
1085
	[IFLA_GENEVE_LABEL]		= { .type = NLA_U32 },
1086
	[IFLA_GENEVE_PORT]		= { .type = NLA_U16 },
1087
	[IFLA_GENEVE_COLLECT_METADATA]	= { .type = NLA_FLAG },
1088 1089 1090
	[IFLA_GENEVE_UDP_CSUM]		= { .type = NLA_U8 },
	[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
	[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]	= { .type = NLA_U8 },
1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115
};

/* rtnetlink ->validate hook: sanity-check the netlink attributes before
 * a geneve device is created or changed.  Returns 0 when acceptable,
 * a negative errno otherwise.
 */
static int geneve_validate(struct nlattr *tb[], struct nlattr *data[])
{
	struct nlattr *addr = tb[IFLA_ADDRESS];

	if (addr) {
		/* a supplied link-layer address must be a well-formed
		 * unicast MAC
		 */
		if (nla_len(addr) != ETH_ALEN)
			return -EINVAL;
		if (!is_valid_ether_addr(nla_data(addr)))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		return -EINVAL;

	if (data[IFLA_GENEVE_ID]) {
		__u32 vni = nla_get_u32(data[IFLA_GENEVE_ID]);

		/* the VNI is a 24-bit quantity */
		if (vni >= GENEVE_VID_MASK)
			return -ERANGE;
	}

	return 0;
}

1116
static struct geneve_dev *geneve_find_dev(struct geneve_net *gn,
1117
					  const struct ip_tunnel_info *info,
1118 1119 1120
					  bool *tun_on_same_port,
					  bool *tun_collect_md)
{
1121
	struct geneve_dev *geneve, *t = NULL;
1122 1123 1124 1125

	*tun_on_same_port = false;
	*tun_collect_md = false;
	list_for_each_entry(geneve, &gn->geneve_list, next) {
1126
		if (info->key.tp_dst == geneve->info.key.tp_dst) {
1127 1128 1129
			*tun_collect_md = geneve->collect_md;
			*tun_on_same_port = true;
		}
1130 1131 1132
		if (info->key.tun_id == geneve->info.key.tun_id &&
		    info->key.tp_dst == geneve->info.key.tp_dst &&
		    !memcmp(&info->key.u, &geneve->info.key.u, sizeof(info->key.u)))
1133 1134 1135 1136 1137
			t = geneve;
	}
	return t;
}

1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157
static bool is_all_zero(const u8 *fp, size_t size)
{
	int i;

	for (i = 0; i < size; i++)
		if (fp[i])
			return false;
	return true;
}

static bool is_tnl_info_zero(const struct ip_tunnel_info *info)
{
	if (info->key.tun_id || info->key.tun_flags || info->key.tos ||
	    info->key.ttl || info->key.label || info->key.tp_src ||
	    !is_all_zero((const u8 *)&info->key.u, sizeof(info->key.u)))
		return false;
	else
		return true;
}

1158
static int geneve_configure(struct net *net, struct net_device *dev,
1159 1160
			    const struct ip_tunnel_info *info,
			    bool metadata, bool ipv6_rx_csum)
1161 1162
{
	struct geneve_net *gn = net_generic(net, geneve_net_id);
1163 1164
	struct geneve_dev *t, *geneve = netdev_priv(dev);
	bool tun_collect_md, tun_on_same_port;
P
Paolo Abeni 已提交
1165
	int err, encap_len;
1166

1167
	if (metadata && !is_tnl_info_zero(info))
1168
		return -EINVAL;
1169 1170 1171 1172

	geneve->net = net;
	geneve->dev = dev;

1173
	t = geneve_find_dev(gn, info, &tun_on_same_port, &tun_collect_md);
1174 1175 1176
	if (t)
		return -EBUSY;

P
Paolo Abeni 已提交
1177 1178
	/* make enough headroom for basic scenario */
	encap_len = GENEVE_BASE_HLEN + ETH_HLEN;
1179
	if (ip_tunnel_info_af(info) == AF_INET) {
P
Paolo Abeni 已提交
1180
		encap_len += sizeof(struct iphdr);
1181 1182
		dev->max_mtu -= sizeof(struct iphdr);
	} else {
P
Paolo Abeni 已提交
1183
		encap_len += sizeof(struct ipv6hdr);
1184 1185
		dev->max_mtu -= sizeof(struct ipv6hdr);
	}
P
Paolo Abeni 已提交
1186 1187
	dev->needed_headroom = encap_len + ETH_HLEN;

1188 1189 1190 1191 1192 1193 1194 1195
	if (metadata) {
		if (tun_on_same_port)
			return -EPERM;
	} else {
		if (tun_collect_md)
			return -EPERM;
	}

1196 1197 1198 1199
	dst_cache_reset(&geneve->info.dst_cache);
	geneve->info = *info;
	geneve->collect_md = metadata;
	geneve->use_udp6_rx_checksums = ipv6_rx_csum;
P
Paolo Abeni 已提交
1200

1201 1202 1203 1204
	err = register_netdevice(dev);
	if (err)
		return err;

1205 1206 1207 1208
	list_add(&geneve->next, &gn->geneve_list);
	return 0;
}

1209 1210 1211 1212 1213 1214
static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port)
{
	memset(info, 0, sizeof(*info));
	info->key.tp_dst = htons(dst_port);
}

1215 1216 1217
static int geneve_newlink(struct net *net, struct net_device *dev,
			  struct nlattr *tb[], struct nlattr *data[])
{
1218 1219
	bool use_udp6_rx_checksums = false;
	struct ip_tunnel_info info;
1220
	bool metadata = false;
1221 1222

	init_tnl_info(&info, GENEVE_UDP_PORT);
1223

1224 1225 1226 1227
	if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6])
		return -EINVAL;

	if (data[IFLA_GENEVE_REMOTE]) {
1228
		info.key.u.ipv4.dst =
1229
			nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);
1230 1231 1232 1233 1234

		if (IN_MULTICAST(ntohl(info.key.u.ipv4.dst))) {
			netdev_dbg(dev, "multicast remote is unsupported\n");
			return -EINVAL;
		}
1235 1236 1237
	}

	if (data[IFLA_GENEVE_REMOTE6]) {
1238 1239 1240
 #if IS_ENABLED(CONFIG_IPV6)
		info.mode = IP_TUNNEL_INFO_IPV6;
		info.key.u.ipv6.dst =
1241 1242
			nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]);

1243
		if (ipv6_addr_type(&info.key.u.ipv6.dst) &
1244 1245 1246 1247
		    IPV6_ADDR_LINKLOCAL) {
			netdev_dbg(dev, "link-local remote is unsupported\n");
			return -EINVAL;
		}
1248 1249 1250 1251 1252 1253 1254 1255 1256
		if (ipv6_addr_is_multicast(&info.key.u.ipv6.dst)) {
			netdev_dbg(dev, "multicast remote is unsupported\n");
			return -EINVAL;
		}
		info.key.tun_flags |= TUNNEL_CSUM;
		use_udp6_rx_checksums = true;
#else
		return -EPFNOSUPPORT;
#endif
1257 1258
	}

1259 1260 1261 1262
	if (data[IFLA_GENEVE_ID]) {
		__u32 vni;
		__u8 tvni[3];

1263
		vni = nla_get_u32(data[IFLA_GENEVE_ID]);
1264 1265 1266
		tvni[0] = (vni & 0x00ff0000) >> 16;
		tvni[1] = (vni & 0x0000ff00) >> 8;
		tvni[2] =  vni & 0x000000ff;
1267

1268 1269
		info.key.tun_id = vni_to_tunnel_id(tvni);
	}
1270
	if (data[IFLA_GENEVE_TTL])
1271
		info.key.ttl = nla_get_u8(data[IFLA_GENEVE_TTL]);
1272

1273
	if (data[IFLA_GENEVE_TOS])
1274
		info.key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
1275

1276 1277 1278 1279 1280 1281
	if (data[IFLA_GENEVE_LABEL]) {
		info.key.label = nla_get_be32(data[IFLA_GENEVE_LABEL]) &
				  IPV6_FLOWLABEL_MASK;
		if (info.key.label && (!(info.mode & IP_TUNNEL_INFO_IPV6)))
			return -EINVAL;
	}
1282

1283
	if (data[IFLA_GENEVE_PORT])
1284
		info.key.tp_dst = nla_get_be16(data[IFLA_GENEVE_PORT]);
1285

1286 1287
	if (data[IFLA_GENEVE_COLLECT_METADATA])
		metadata = true;
1288

1289
	if (data[IFLA_GENEVE_UDP_CSUM] &&
1290
	    !nla_get_u8(data[IFLA_GENEVE_UDP_CSUM]))
1291
		info.key.tun_flags |= TUNNEL_CSUM;
1292 1293 1294

	if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX] &&
	    nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]))
1295
		info.key.tun_flags &= ~TUNNEL_CSUM;
1296 1297 1298

	if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX] &&
	    nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]))
1299
		use_udp6_rx_checksums = false;
1300

1301
	return geneve_configure(net, dev, &info, metadata, use_udp6_rx_checksums);
1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314
}

/* Unlink the tunnel from its per-netns list and queue the netdev for
 * unregistration on @head.
 */
static void geneve_dellink(struct net_device *dev, struct list_head *head)
{
	struct geneve_dev *priv = netdev_priv(dev);

	list_del(&priv->next);
	unregister_netdevice_queue(dev, head);
}

static size_t geneve_get_size(const struct net_device *dev)
{
	return nla_total_size(sizeof(__u32)) +	/* IFLA_GENEVE_ID */
1315
		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GENEVE_REMOTE{6} */
1316
		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TTL */
1317
		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TOS */
1318
		nla_total_size(sizeof(__be32)) +  /* IFLA_GENEVE_LABEL */
1319
		nla_total_size(sizeof(__be16)) +  /* IFLA_GENEVE_PORT */
1320
		nla_total_size(0) +	 /* IFLA_GENEVE_COLLECT_METADATA */
1321 1322 1323
		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */
		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX */
		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */
1324 1325 1326 1327 1328 1329
		0;
}

static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct geneve_dev *geneve = netdev_priv(dev);
1330 1331
	struct ip_tunnel_info *info = &geneve->info;
	__u8 tmp_vni[3];
1332 1333
	__u32 vni;

1334 1335
	tunnel_id_to_vni(info->key.tun_id, tmp_vni);
	vni = (tmp_vni[0] << 16) | (tmp_vni[1] << 8) | tmp_vni[2];
1336 1337 1338
	if (nla_put_u32(skb, IFLA_GENEVE_ID, vni))
		goto nla_put_failure;

1339
	if (ip_tunnel_info_af(info) == AF_INET) {
1340
		if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE,
1341 1342 1343 1344 1345
				    info->key.u.ipv4.dst))
			goto nla_put_failure;

		if (nla_put_u8(skb, IFLA_GENEVE_UDP_CSUM,
			       !!(info->key.tun_flags & TUNNEL_CSUM)))
1346
			goto nla_put_failure;
1347

1348 1349 1350
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		if (nla_put_in6_addr(skb, IFLA_GENEVE_REMOTE6,
1351 1352 1353 1354 1355 1356 1357 1358 1359
				     &info->key.u.ipv6.dst))
			goto nla_put_failure;

		if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
			       !(info->key.tun_flags & TUNNEL_CSUM)))
			goto nla_put_failure;

		if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
			       !geneve->use_udp6_rx_checksums))
1360 1361 1362
			goto nla_put_failure;
#endif
	}
1363

1364 1365 1366
	if (nla_put_u8(skb, IFLA_GENEVE_TTL, info->key.ttl) ||
	    nla_put_u8(skb, IFLA_GENEVE_TOS, info->key.tos) ||
	    nla_put_be32(skb, IFLA_GENEVE_LABEL, info->key.label))
1367 1368
		goto nla_put_failure;

1369
	if (nla_put_be16(skb, IFLA_GENEVE_PORT, info->key.tp_dst))
1370 1371
		goto nla_put_failure;

1372 1373 1374 1375
	if (geneve->collect_md) {
		if (nla_put_flag(skb, IFLA_GENEVE_COLLECT_METADATA))
			goto nla_put_failure;
	}
1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/* rtnetlink glue: binds the "geneve" link kind to the handlers above so
 * "ip link add ... type geneve" creates and manages these devices.
 */
static struct rtnl_link_ops geneve_link_ops __read_mostly = {
	.kind		= "geneve",
	.maxtype	= IFLA_GENEVE_MAX,
	.policy		= geneve_policy,
	.priv_size	= sizeof(struct geneve_dev),
	.setup		= geneve_setup,
	.validate	= geneve_validate,
	.newlink	= geneve_newlink,
	.dellink	= geneve_dellink,
	.get_size	= geneve_get_size,
	.fill_info	= geneve_fill_info,
};

1395 1396 1397 1398
struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
					u8 name_assign_type, u16 dst_port)
{
	struct nlattr *tb[IFLA_MAX + 1];
1399
	struct ip_tunnel_info info;
1400
	struct net_device *dev;
1401
	LIST_HEAD(list_kill);
1402 1403 1404 1405 1406 1407 1408 1409
	int err;

	memset(tb, 0, sizeof(tb));
	dev = rtnl_create_link(net, name, name_assign_type,
			       &geneve_link_ops, tb);
	if (IS_ERR(dev))
		return dev;

1410 1411
	init_tnl_info(&info, dst_port);
	err = geneve_configure(net, dev, &info, true, true);
1412 1413 1414 1415
	if (err) {
		free_netdev(dev);
		return ERR_PTR(err);
	}
1416 1417 1418 1419

	/* openvswitch users expect packet sizes to be unrestricted,
	 * so set the largest MTU we can.
	 */
1420
	err = geneve_change_mtu(dev, IP_MAX_MTU);
1421 1422 1423
	if (err)
		goto err;

1424 1425 1426 1427
	err = rtnl_configure_link(dev, NULL);
	if (err < 0)
		goto err;

1428
	return dev;
1429
err:
1430 1431
	geneve_dellink(dev, &list_kill);
	unregister_netdevice_many(&list_kill);
1432
	return ERR_PTR(err);
1433 1434 1435
}
EXPORT_SYMBOL_GPL(geneve_dev_create_fb);

1436 1437 1438 1439 1440
static int geneve_netdevice_event(struct notifier_block *unused,
				  unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

1441
	if (event == NETDEV_UDP_TUNNEL_PUSH_INFO)
1442 1443 1444 1445 1446 1447 1448 1449 1450
		geneve_push_rx_ports(dev);

	return NOTIFY_DONE;
}

/* Registered in geneve_init_module() to receive netdev events. */
static struct notifier_block geneve_notifier_block __read_mostly = {
	.notifier_call = geneve_netdevice_event,
};

1451 1452 1453 1454 1455
static __net_init int geneve_init_net(struct net *net)
{
	struct geneve_net *gn = net_generic(net, geneve_net_id);

	INIT_LIST_HEAD(&gn->geneve_list);
1456
	INIT_LIST_HEAD(&gn->sock_list);
1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502
	return 0;
}

/* Per-netns teardown: unregister every geneve device that either lives
 * in @net or was created from it, in one batch under the rtnl lock.
 */
static void __net_exit geneve_exit_net(struct net *net)
{
	struct geneve_net *gn = net_generic(net, geneve_net_id);
	struct geneve_dev *geneve, *next;
	struct net_device *dev, *aux;
	LIST_HEAD(list);

	rtnl_lock();

	/* gather any geneve devices that were moved into this ns */
	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == &geneve_link_ops)
			unregister_netdevice_queue(dev, &list);

	/* now gather any other geneve devices that were created in this ns */
	list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) {
		/* If geneve->dev is in the same netns, it was already added
		 * to the list by the previous loop.
		 */
		if (!net_eq(dev_net(geneve->dev), net))
			unregister_netdevice_queue(geneve->dev, &list);
	}

	/* unregister the devices gathered above */
	unregister_netdevice_many(&list);
	rtnl_unlock();
}

/* Per-network-namespace init/exit hooks and private-data sizing. */
static struct pernet_operations geneve_net_ops = {
	.init = geneve_init_net,
	.exit = geneve_exit_net,
	.id   = &geneve_net_id,
	.size = sizeof(struct geneve_net),
};

/* Module init: register pernet ops, the netdev notifier and the
 * rtnetlink link type, unwinding in reverse on failure.
 */
static int __init geneve_init_module(void)
{
	int rc;

	rc = register_pernet_subsys(&geneve_net_ops);
	if (rc)
		goto out1;

	rc = register_netdevice_notifier(&geneve_notifier_block);
	if (rc)
		goto out2;

	rc = rtnl_link_register(&geneve_link_ops);
	if (rc)
		goto out3;

	return 0;
out3:
	unregister_netdevice_notifier(&geneve_notifier_block);
out2:
	unregister_pernet_subsys(&geneve_net_ops);
out1:
	return rc;
}
late_initcall(geneve_init_module);

/* Module unload: tear everything down in reverse registration order. */
static void __exit geneve_cleanup_module(void)
{
	rtnl_link_unregister(&geneve_link_ops);
	unregister_netdevice_notifier(&geneve_notifier_block);
	unregister_pernet_subsys(&geneve_net_ops);
}
module_exit(geneve_cleanup_module);

MODULE_LICENSE("GPL");
MODULE_VERSION(GENEVE_NETDEV_VER);
MODULE_AUTHOR("John W. Linville <linville@tuxdriver.com>");
MODULE_DESCRIPTION("Interface driver for GENEVE encapsulated traffic");
MODULE_ALIAS_RTNL_LINK("geneve");