geneve.c 37.6 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/*
 * GENEVE: Generic Network Virtualization Encapsulation
 *
 * Copyright (c) 2015 Red Hat, Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/etherdevice.h>
#include <linux/hash.h>
17
#include <net/dst_metadata.h>
18
#include <net/gro_cells.h>
19 20
#include <net/rtnetlink.h>
#include <net/geneve.h>
21
#include <net/protocol.h>
22 23 24 25 26 27 28 29 30 31 32 33 34 35 36

/* Driver version string reported through ethtool's get_drvinfo. */
#define GENEVE_NETDEV_VER	"0.6"

/* UDP port number for Geneve; NOTE(review): presumably the default
 * destination port, its use is not visible in this part of the file.
 */
#define GENEVE_UDP_PORT		6081

/* A VNI is 24 bits wide: number of distinct VNIs and the matching bit mask. */
#define GENEVE_N_VID		(1u << 24)
#define GENEVE_VID_MASK		(GENEVE_N_VID - 1)

/* Size of the per-socket VNI hash table (struct geneve_sock::vni_list). */
#define VNI_HASH_BITS		10
#define VNI_HASH_SIZE		(1<<VNI_HASH_BITS)

/* When true, geneve_rx() logs a rate-limited message for packets whose
 * ECN decapsulation reports an error.
 */
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

37 38 39
/* Geneve version: required on received headers and written on transmit. */
#define GENEVE_VER 0
/* Length of the UDP header plus the fixed Geneve header (options excluded). */
#define GENEVE_BASE_HLEN (sizeof(struct udphdr) + sizeof(struct genevehdr))

40 41
/* per-network namespace private data for this module */
struct geneve_net {
42 43
	struct list_head	geneve_list;
	struct list_head	sock_list;
44 45
};

46
/* Id passed to net_generic() to fetch this module's struct geneve_net. */
static unsigned int geneve_net_id;
47

48 49 50 51 52
/* Pseudo network device */
struct geneve_dev {
	struct hlist_node  hlist;	/* vni hash table */
	struct net	   *net;	/* netns for packet i/o */
	struct net_device  *dev;	/* netdev for geneve tunnel */
53
	struct ip_tunnel_info info;
54
	struct geneve_sock __rcu *sock4;	/* IPv4 socket used for geneve tunnel */
55
#if IS_ENABLED(CONFIG_IPV6)
56
	struct geneve_sock __rcu *sock6;	/* IPv6 socket used for geneve tunnel */
57
#endif
58
	struct list_head   next;	/* geneve's per namespace list */
59
	struct gro_cells   gro_cells;
60 61
	bool		   collect_md;
	bool		   use_udp6_rx_checksums;
62 63
};

64 65 66 67 68 69
struct geneve_sock {
	bool			collect_md;
	struct list_head	list;
	struct socket		*sock;
	struct rcu_head		rcu;
	int			refcnt;
70
	struct hlist_head	vni_list[VNI_HASH_SIZE];
71
};
72 73 74 75 76 77 78 79 80

/* Map a 3-byte VNI (vni[0] is the most significant byte) to an index
 * into a VNI_HASH_BITS-wide hash table.
 */
static inline __u32 geneve_net_vni_hash(u8 vni[3])
{
	__u32 key = vni[2];

	key |= vni[1] << 8;
	key |= vni[0] << 16;

	return hash_32(key, VNI_HASH_BITS);
}

81 82 83 84 85 86 87 88 89 90 91
/* Convert a 24 bit VNI (three bytes, vni[0] most significant) to a 64 bit
 * tunnel ID in big-endian representation.
 *
 * Both branches yield the same in-memory layout: the VNI occupies the
 * last three bytes of the __be64 and the remaining bytes are zero.
 */
static __be64 vni_to_tunnel_id(const __u8 *vni)
{
#ifdef __BIG_ENDIAN
	return (vni[0] << 16) | (vni[1] << 8) | vni[2];
#else
	/* Little-endian host: these shifts land the bytes at offsets 5, 6
	 * and 7 of the stored value.
	 */
	return (__force __be64)(((__force u64)vni[0] << 40) |
				((__force u64)vni[1] << 48) |
				((__force u64)vni[2] << 56));
#endif
}

92 93 94 95 96 97 98 99 100 101 102 103 104 105
/* Convert 64 bit tunnel ID to 24 bit VNI.
 *
 * Inverse of vni_to_tunnel_id(): writes the three VNI bytes, most
 * significant first, into vni[0..2]. Only the 24 bits that hold the VNI
 * are read from @tun_id.
 */
static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
{
#ifdef __BIG_ENDIAN
	vni[0] = (__force __u8)(tun_id >> 16);
	vni[1] = (__force __u8)(tun_id >> 8);
	vni[2] = (__force __u8)tun_id;
#else
	/* Little-endian host: mirror the shifts used by vni_to_tunnel_id(). */
	vni[0] = (__force __u8)((__force u64)tun_id >> 40);
	vni[1] = (__force __u8)((__force u64)tun_id >> 48);
	vni[2] = (__force __u8)((__force u64)tun_id >> 56);
#endif
}

106 107 108 109 110 111 112 113 114 115 116
/* Compare the VNI embedded in a tunnel ID with a 3-byte VNI.
 *
 * @tun_id: the eight bytes of a __be64 tunnel ID as produced by
 *          vni_to_tunnel_id()
 * @vni:    3-byte VNI, most significant byte first
 *
 * vni_to_tunnel_id() stores the VNI in bytes 5..7 of the in-memory tunnel
 * ID on little- AND big-endian hosts, so a single byte-wise comparison is
 * correct on both. The former big-endian branch compared bytes 0..2 in
 * reverse order and therefore never matched a non-zero VNI.
 *
 * Returns true when both describe the same VNI.
 */
static bool eq_tun_id_and_vni(u8 *tun_id, u8 *vni)
{
	return !memcmp(vni, &tun_id[5], 3);
}

117 118 119 120 121
static sa_family_t geneve_get_sk_family(struct geneve_sock *gs)
{
	return gs->sock->sk->sk_family;
}

122
static struct geneve_dev *geneve_lookup(struct geneve_sock *gs,
123
					__be32 addr, u8 vni[])
124 125
{
	struct hlist_head *vni_list_head;
126
	struct geneve_dev *geneve;
127 128 129
	__u32 hash;

	/* Find the device for this VNI */
130
	hash = geneve_net_vni_hash(vni);
131
	vni_list_head = &gs->vni_list[hash];
132
	hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) {
133
		if (eq_tun_id_and_vni((u8 *)&geneve->info.key.tun_id, vni) &&
134
		    addr == geneve->info.key.u.ipv4.dst)
135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151
			return geneve;
	}
	return NULL;
}

#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 counterpart of geneve_lookup(): find the device hashed on @gs whose
 * tunnel ID matches @vni and whose configured IPv6 destination equals
 * @addr6. Returns the device or NULL.
 */
static struct geneve_dev *geneve6_lookup(struct geneve_sock *gs,
					 struct in6_addr addr6, u8 vni[])
{
	struct hlist_head *bucket = &gs->vni_list[geneve_net_vni_hash(vni)];
	struct geneve_dev *cand;

	hlist_for_each_entry_rcu(cand, bucket, hlist) {
		u8 *tun_id = (u8 *)&cand->info.key.tun_id;

		if (!eq_tun_id_and_vni(tun_id, vni))
			continue;
		if (ipv6_addr_equal(&addr6, &cand->info.key.u.ipv6.dst))
			return cand;
	}

	return NULL;
}
158
#endif
159

160 161 162 163 164
static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb)
{
	return (struct genevehdr *)(udp_hdr(skb) + 1);
}

165 166
static struct geneve_dev *geneve_lookup_skb(struct geneve_sock *gs,
					    struct sk_buff *skb)
167
{
168
	static u8 zero_vni[3];
169
	u8 *vni;
170

171
	if (geneve_get_sk_family(gs) == AF_INET) {
172
		struct iphdr *iph;
173
		__be32 addr;
174

175
		iph = ip_hdr(skb); /* outer IP header... */
176

177 178 179 180
		if (gs->collect_md) {
			vni = zero_vni;
			addr = 0;
		} else {
181
			vni = geneve_hdr(skb)->vni;
182 183 184
			addr = iph->saddr;
		}

185
		return geneve_lookup(gs, addr, vni);
186
#if IS_ENABLED(CONFIG_IPV6)
187
	} else if (geneve_get_sk_family(gs) == AF_INET6) {
188
		static struct in6_addr zero_addr6;
189 190 191
		struct ipv6hdr *ip6h;
		struct in6_addr addr6;

192
		ip6h = ipv6_hdr(skb); /* outer IPv6 header... */
193

194 195 196 197
		if (gs->collect_md) {
			vni = zero_vni;
			addr6 = zero_addr6;
		} else {
198
			vni = geneve_hdr(skb)->vni;
199 200 201
			addr6 = ip6h->saddr;
		}

202
		return geneve6_lookup(gs, addr6, vni);
203 204
#endif
	}
205 206 207 208 209 210 211 212 213 214
	return NULL;
}

/* geneve receive/decap routine */
static void geneve_rx(struct geneve_dev *geneve, struct geneve_sock *gs,
		      struct sk_buff *skb)
{
	struct genevehdr *gnvh = geneve_hdr(skb);
	struct metadata_dst *tun_dst = NULL;
	struct pcpu_sw_netstats *stats;
215
	unsigned int len;
216 217
	int err = 0;
	void *oiph;
218

219
	if (ip_tunnel_collect_metadata() || gs->collect_md) {
220 221 222 223 224 225
		__be16 flags;

		flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT |
			(gnvh->oam ? TUNNEL_OAM : 0) |
			(gnvh->critical ? TUNNEL_CRIT_OPT : 0);

226
		tun_dst = udp_tun_rx_dst(skb, geneve_get_sk_family(gs), flags,
227 228
					 vni_to_tunnel_id(gnvh->vni),
					 gnvh->opt_len * 4);
229 230
		if (!tun_dst) {
			geneve->dev->stats.rx_dropped++;
231
			goto drop;
232
		}
233
		/* Update tunnel dst according to Geneve options. */
234 235
		ip_tunnel_info_opts_set(&tun_dst->u.tun_info,
					gnvh->options, gnvh->opt_len * 4);
236 237 238 239
	} else {
		/* Drop packets w/ critical options,
		 * since we don't support any...
		 */
240 241 242
		if (gnvh->critical) {
			geneve->dev->stats.rx_frame_errors++;
			geneve->dev->stats.rx_errors++;
243
			goto drop;
244
		}
245
	}
246 247 248 249 250

	skb_reset_mac_header(skb);
	skb->protocol = eth_type_trans(skb, geneve->dev);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

251 252 253
	if (tun_dst)
		skb_dst_set(skb, &tun_dst->dst);

254
	/* Ignore packet loops (and multicast echo) */
255 256
	if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr)) {
		geneve->dev->stats.rx_errors++;
257
		goto drop;
258
	}
259

260
	oiph = skb_network_header(skb);
261 262
	skb_reset_network_header(skb);

263 264
	if (geneve_get_sk_family(gs) == AF_INET)
		err = IP_ECN_decapsulate(oiph, skb);
265
#if IS_ENABLED(CONFIG_IPV6)
266 267
	else
		err = IP6_ECN_decapsulate(oiph, skb);
268
#endif
269 270

	if (unlikely(err)) {
271
		if (log_ecn_error) {
272
			if (geneve_get_sk_family(gs) == AF_INET)
273 274
				net_info_ratelimited("non-ECT from %pI4 "
						     "with TOS=%#x\n",
275 276
						     &((struct iphdr *)oiph)->saddr,
						     ((struct iphdr *)oiph)->tos);
277
#if IS_ENABLED(CONFIG_IPV6)
278
			else
279
				net_info_ratelimited("non-ECT from %pI6\n",
280
						     &((struct ipv6hdr *)oiph)->saddr);
281 282
#endif
		}
283 284 285 286 287 288 289
		if (err > 1) {
			++geneve->dev->stats.rx_frame_errors;
			++geneve->dev->stats.rx_errors;
			goto drop;
		}
	}

290 291 292 293 294 295 296 297 298
	len = skb->len;
	err = gro_cells_receive(&geneve->gro_cells, skb);
	if (likely(err == NET_RX_SUCCESS)) {
		stats = this_cpu_ptr(geneve->dev->tstats);
		u64_stats_update_begin(&stats->syncp);
		stats->rx_packets++;
		stats->rx_bytes += len;
		u64_stats_update_end(&stats->syncp);
	}
299 300 301 302 303 304 305 306 307
	return;
drop:
	/* Consume bad packet */
	kfree_skb(skb);
}

/* Setup stats when device is created */
static int geneve_init(struct net_device *dev)
{
308 309 310
	struct geneve_dev *geneve = netdev_priv(dev);
	int err;

311 312 313
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
	if (!dev->tstats)
		return -ENOMEM;
314 315 316 317 318 319 320

	err = gro_cells_init(&geneve->gro_cells, dev);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

321
	err = dst_cache_init(&geneve->info.dst_cache, GFP_KERNEL);
P
Paolo Abeni 已提交
322 323 324 325 326
	if (err) {
		free_percpu(dev->tstats);
		gro_cells_destroy(&geneve->gro_cells);
		return err;
	}
327 328 329 330 331
	return 0;
}

static void geneve_uninit(struct net_device *dev)
{
332 333
	struct geneve_dev *geneve = netdev_priv(dev);

334
	dst_cache_destroy(&geneve->info.dst_cache);
335
	gro_cells_destroy(&geneve->gro_cells);
336 337 338
	free_percpu(dev->tstats);
}

339 340 341 342
/* Callback from net/ipv4/udp.c to receive packets */
static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
	struct genevehdr *geneveh;
343
	struct geneve_dev *geneve;
344 345 346
	struct geneve_sock *gs;
	int opts_len;

347
	/* Need UDP and Geneve header to be present */
348
	if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN)))
349
		goto drop;
350 351 352 353

	/* Return packets with reserved bits set */
	geneveh = geneve_hdr(skb);
	if (unlikely(geneveh->ver != GENEVE_VER))
354
		goto drop;
355 356

	if (unlikely(geneveh->proto_type != htons(ETH_P_TEB)))
357
		goto drop;
358

359 360 361 362 363 364 365 366
	gs = rcu_dereference_sk_user_data(sk);
	if (!gs)
		goto drop;

	geneve = geneve_lookup_skb(gs, skb);
	if (!geneve)
		goto drop;

367 368
	opts_len = geneveh->opt_len * 4;
	if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len,
369
				 htons(ETH_P_TEB),
370 371
				 !net_eq(geneve->net, dev_net(geneve->dev)))) {
		geneve->dev->stats.rx_dropped++;
372
		goto drop;
373
	}
374

375
	geneve_rx(geneve, gs, skb);
376 377 378 379 380 381 382 383 384
	return 0;

drop:
	/* Consume bad packet */
	kfree_skb(skb);
	return 0;
}

static struct socket *geneve_create_sock(struct net *net, bool ipv6,
385
					 __be16 port, bool ipv6_rx_csum)
386 387 388 389 390 391 392 393 394
{
	struct socket *sock;
	struct udp_port_cfg udp_conf;
	int err;

	memset(&udp_conf, 0, sizeof(udp_conf));

	if (ipv6) {
		udp_conf.family = AF_INET6;
395
		udp_conf.ipv6_v6only = 1;
396
		udp_conf.use_udp6_rx_checksums = ipv6_rx_csum;
397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416
	} else {
		udp_conf.family = AF_INET;
		udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
	}

	udp_conf.local_udp_port = port;

	/* Open UDP socket */
	err = udp_sock_create(net, &udp_conf, &sock);
	if (err < 0)
		return ERR_PTR(err);

	return sock;
}

/* Total Geneve header length in bytes: fixed part plus options, where
 * opt_len is counted in 4-byte units.
 */
static int geneve_hlen(struct genevehdr *gh)
{
	int opts_bytes = gh->opt_len * 4;

	return sizeof(*gh) + opts_bytes;
}

417 418 419
static struct sk_buff **geneve_gro_receive(struct sock *sk,
					   struct sk_buff **head,
					   struct sk_buff *skb)
420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463
{
	struct sk_buff *p, **pp = NULL;
	struct genevehdr *gh, *gh2;
	unsigned int hlen, gh_len, off_gnv;
	const struct packet_offload *ptype;
	__be16 type;
	int flush = 1;

	off_gnv = skb_gro_offset(skb);
	hlen = off_gnv + sizeof(*gh);
	gh = skb_gro_header_fast(skb, off_gnv);
	if (skb_gro_header_hard(skb, hlen)) {
		gh = skb_gro_header_slow(skb, hlen, off_gnv);
		if (unlikely(!gh))
			goto out;
	}

	if (gh->ver != GENEVE_VER || gh->oam)
		goto out;
	gh_len = geneve_hlen(gh);

	hlen = off_gnv + gh_len;
	if (skb_gro_header_hard(skb, hlen)) {
		gh = skb_gro_header_slow(skb, hlen, off_gnv);
		if (unlikely(!gh))
			goto out;
	}

	for (p = *head; p; p = p->next) {
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		gh2 = (struct genevehdr *)(p->data + off_gnv);
		if (gh->opt_len != gh2->opt_len ||
		    memcmp(gh, gh2, gh_len)) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}
	}

	type = gh->proto_type;

	rcu_read_lock();
	ptype = gro_find_receive_by_type(type);
464
	if (!ptype)
465 466 467 468
		goto out_unlock;

	skb_gro_pull(skb, gh_len);
	skb_gro_postpull_rcsum(skb, gh, gh_len);
S
Sabrina Dubroca 已提交
469
	pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb);
470
	flush = 0;
471 472 473 474 475 476 477 478 479

out_unlock:
	rcu_read_unlock();
out:
	NAPI_GRO_CB(skb)->flush |= flush;

	return pp;
}

480 481
static int geneve_gro_complete(struct sock *sk, struct sk_buff *skb,
			       int nhoff)
482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498
{
	struct genevehdr *gh;
	struct packet_offload *ptype;
	__be16 type;
	int gh_len;
	int err = -ENOSYS;

	gh = (struct genevehdr *)(skb->data + nhoff);
	gh_len = geneve_hlen(gh);
	type = gh->proto_type;

	rcu_read_lock();
	ptype = gro_find_complete_by_type(type);
	if (ptype)
		err = ptype->callbacks.gro_complete(skb, nhoff + gh_len);

	rcu_read_unlock();
499 500 501

	skb_set_inner_mac_header(skb, nhoff + gh_len);

502 503 504 505 506
	return err;
}

/* Create a new geneve listen socket on @port in @net.
 *
 * Allocates the geneve_sock with a reference count of 1, opens the UDP
 * socket, announces the port via udp_tunnel_notify_add_rx_port(),
 * registers the encapsulation and GRO callbacks and links the socket into
 * the per-netns socket list.
 *
 * Returns the new geneve_sock or an ERR_PTR().
 */
static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port,
						bool ipv6, bool ipv6_rx_csum)
{
	struct geneve_net *gn = net_generic(net, geneve_net_id);
	struct udp_tunnel_sock_cfg tunnel_cfg;
	struct geneve_sock *gs;
	struct socket *sock;
	int bucket;

	gs = kzalloc(sizeof(*gs), GFP_KERNEL);
	if (!gs)
		return ERR_PTR(-ENOMEM);

	sock = geneve_create_sock(net, ipv6, port, ipv6_rx_csum);
	if (IS_ERR(sock)) {
		kfree(gs);
		return ERR_CAST(sock);
	}

	gs->sock = sock;
	gs->refcnt = 1;
	for (bucket = 0; bucket < VNI_HASH_SIZE; bucket++)
		INIT_HLIST_HEAD(&gs->vni_list[bucket]);

	/* Initialize the geneve udp offloads structure */
	udp_tunnel_notify_add_rx_port(sock, UDP_TUNNEL_TYPE_GENEVE);

	/* Mark socket as an encapsulation socket */
	memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
	tunnel_cfg.sk_user_data = gs;
	tunnel_cfg.encap_type = 1;
	tunnel_cfg.encap_rcv = geneve_udp_encap_recv;
	tunnel_cfg.encap_destroy = NULL;
	tunnel_cfg.gro_receive = geneve_gro_receive;
	tunnel_cfg.gro_complete = geneve_gro_complete;
	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);

	list_add(&gs->list, &gn->sock_list);
	return gs;
}

546
static void __geneve_sock_release(struct geneve_sock *gs)
547
{
548
	if (!gs || --gs->refcnt)
549 550 551
		return;

	list_del(&gs->list);
552
	udp_tunnel_notify_del_rx_port(gs->sock, UDP_TUNNEL_TYPE_GENEVE);
553 554 555 556
	udp_tunnel_sock_release(gs->sock);
	kfree_rcu(gs, rcu);
}

557 558
static void geneve_sock_release(struct geneve_dev *geneve)
{
559
	struct geneve_sock *gs4 = rtnl_dereference(geneve->sock4);
560
#if IS_ENABLED(CONFIG_IPV6)
561 562 563 564 565 566 567 568 569 570 571
	struct geneve_sock *gs6 = rtnl_dereference(geneve->sock6);

	rcu_assign_pointer(geneve->sock6, NULL);
#endif

	rcu_assign_pointer(geneve->sock4, NULL);
	synchronize_net();

	__geneve_sock_release(gs4);
#if IS_ENABLED(CONFIG_IPV6)
	__geneve_sock_release(gs6);
572 573 574
#endif
}

575
static struct geneve_sock *geneve_find_sock(struct geneve_net *gn,
576
					    sa_family_t family,
577 578 579 580 581 582
					    __be16 dst_port)
{
	struct geneve_sock *gs;

	list_for_each_entry(gs, &gn->sock_list, list) {
		if (inet_sk(gs->sock->sk)->inet_sport == dst_port &&
583
		    geneve_get_sk_family(gs) == family) {
584 585 586 587 588 589
			return gs;
		}
	}
	return NULL;
}

590
/* Attach @geneve to a socket of the requested family on its configured
 * destination port.
 *
 * An existing socket is reused (its refcnt is bumped); otherwise a new one
 * is created. The socket inherits the device's collect_md flag, is
 * published in sock4/sock6, and the device is hashed into the socket's
 * VNI table.
 *
 * Returns 0 on success or the error from geneve_socket_create().
 */
static int geneve_sock_add(struct geneve_dev *geneve, bool ipv6)
{
	struct net *net = geneve->net;
	struct geneve_net *gn = net_generic(net, geneve_net_id);
	__be16 port = geneve->info.key.tp_dst;
	struct geneve_sock *gs;
	__u8 vni[3];

	gs = geneve_find_sock(gn, ipv6 ? AF_INET6 : AF_INET, port);
	if (gs) {
		gs->refcnt++;
	} else {
		gs = geneve_socket_create(net, port, ipv6,
					  geneve->use_udp6_rx_checksums);
		if (IS_ERR(gs))
			return PTR_ERR(gs);
	}

	gs->collect_md = geneve->collect_md;
#if IS_ENABLED(CONFIG_IPV6)
	if (ipv6)
		rcu_assign_pointer(geneve->sock6, gs);
	else
#endif
		rcu_assign_pointer(geneve->sock4, gs);

	tunnel_id_to_vni(geneve->info.key.tun_id, vni);
	hlist_add_head_rcu(&geneve->hlist,
			   &gs->vni_list[geneve_net_vni_hash(vni)]);
	return 0;
}

624 625 626
static int geneve_open(struct net_device *dev)
{
	struct geneve_dev *geneve = netdev_priv(dev);
627
	bool ipv6 = !!(geneve->info.mode & IP_TUNNEL_INFO_IPV6);
628 629 630 631 632 633 634 635 636 637 638 639 640 641 642
	bool metadata = geneve->collect_md;
	int ret = 0;

#if IS_ENABLED(CONFIG_IPV6)
	if (ipv6 || metadata)
		ret = geneve_sock_add(geneve, true);
#endif
	if (!ret && (!ipv6 || metadata))
		ret = geneve_sock_add(geneve, false);
	if (ret < 0)
		geneve_sock_release(geneve);

	return ret;
}

643 644 645 646
static int geneve_stop(struct net_device *dev)
{
	struct geneve_dev *geneve = netdev_priv(dev);

647 648
	if (!hlist_unhashed(&geneve->hlist))
		hlist_del_rcu(&geneve->hlist);
649
	geneve_sock_release(geneve);
650 651 652
	return 0;
}

653
static void geneve_build_header(struct genevehdr *geneveh,
654
				const struct ip_tunnel_info *info)
655 656
{
	geneveh->ver = GENEVE_VER;
657 658 659
	geneveh->opt_len = info->options_len / 4;
	geneveh->oam = !!(info->key.tun_flags & TUNNEL_OAM);
	geneveh->critical = !!(info->key.tun_flags & TUNNEL_CRIT_OPT);
660
	geneveh->rsvd1 = 0;
661
	tunnel_id_to_vni(info->key.tun_id, geneveh->vni);
662 663 664
	geneveh->proto_type = htons(ETH_P_TEB);
	geneveh->rsvd2 = 0;

665
	ip_tunnel_info_opts_get(geneveh->options, info);
666 667
}

668 669 670
/* Prepare @skb for transmission through the tunnel: scrub it, ensure
 * enough headroom for all outer headers, set up UDP tunnel offloads and
 * push the Geneve header built from @info.
 *
 * @ip_hdr_len: size of the outer IP header that will be added later
 *
 * Returns 0 on success. On failure the reference on @dst is released and
 * a negative errno is returned; @skb is not freed here.
 */
static int geneve_build_skb(struct dst_entry *dst, struct sk_buff *skb,
			    const struct ip_tunnel_info *info,
			    bool xnet, int ip_hdr_len)
{
	bool udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
	struct genevehdr *gnvh;
	int min_headroom;
	int err;

	skb_reset_mac_header(skb);
	skb_scrub_packet(skb, xnet);

	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len +
		       GENEVE_BASE_HLEN + info->options_len + ip_hdr_len;

	err = skb_cow_head(skb, min_headroom);
	if (likely(!err))
		err = udp_tunnel_handle_offloads(skb, udp_sum);
	if (err) {
		dst_release(dst);
		return err;
	}

	gnvh = __skb_push(skb, sizeof(*gnvh) + info->options_len);
	geneve_build_header(gnvh, info);
	skb_set_inner_protocol(skb, htons(ETH_P_TEB));
	return 0;
}

static struct rtable *geneve_get_v4_rt(struct sk_buff *skb,
				       struct net_device *dev,
				       struct flowi4 *fl4,
703
				       const struct ip_tunnel_info *info)
704
{
705
	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
706
	struct geneve_dev *geneve = netdev_priv(dev);
P
Paolo Abeni 已提交
707
	struct dst_cache *dst_cache;
708 709 710
	struct rtable *rt = NULL;
	__u8 tos;

711 712 713
	if (!rcu_dereference(geneve->sock4))
		return ERR_PTR(-EIO);

714 715 716
	memset(fl4, 0, sizeof(*fl4));
	fl4->flowi4_mark = skb->mark;
	fl4->flowi4_proto = IPPROTO_UDP;
717 718
	fl4->daddr = info->key.u.ipv4.dst;
	fl4->saddr = info->key.u.ipv4.src;
719

720 721 722 723
	tos = info->key.tos;
	if ((tos == 1) && !geneve->collect_md) {
		tos = ip_tunnel_get_dsfield(ip_hdr(skb), skb);
		use_cache = false;
P
Paolo Abeni 已提交
724
	}
725
	fl4->flowi4_tos = RT_TOS(tos);
P
Paolo Abeni 已提交
726

727
	dst_cache = (struct dst_cache *)&info->dst_cache;
P
Paolo Abeni 已提交
728 729 730 731
	if (use_cache) {
		rt = dst_cache_get_ip4(dst_cache, &fl4->saddr);
		if (rt)
			return rt;
732 733 734 735
	}
	rt = ip_route_output_key(geneve->net, fl4);
	if (IS_ERR(rt)) {
		netdev_dbg(dev, "no route to %pI4\n", &fl4->daddr);
736
		return ERR_PTR(-ENETUNREACH);
737 738 739 740
	}
	if (rt->dst.dev == dev) { /* is this necessary? */
		netdev_dbg(dev, "circular route to %pI4\n", &fl4->daddr);
		ip_rt_put(rt);
741
		return ERR_PTR(-ELOOP);
742
	}
P
Paolo Abeni 已提交
743 744
	if (use_cache)
		dst_cache_set_ip4(dst_cache, &rt->dst, fl4->saddr);
745 746 747
	return rt;
}

748 749 750 751
#if IS_ENABLED(CONFIG_IPV6)
static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb,
					   struct net_device *dev,
					   struct flowi6 *fl6,
752
					   const struct ip_tunnel_info *info)
753
{
754
	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
755 756
	struct geneve_dev *geneve = netdev_priv(dev);
	struct dst_entry *dst = NULL;
P
Paolo Abeni 已提交
757
	struct dst_cache *dst_cache;
758
	struct geneve_sock *gs6;
759
	__u8 prio;
760

761 762 763 764
	gs6 = rcu_dereference(geneve->sock6);
	if (!gs6)
		return ERR_PTR(-EIO);

765 766 767
	memset(fl6, 0, sizeof(*fl6));
	fl6->flowi6_mark = skb->mark;
	fl6->flowi6_proto = IPPROTO_UDP;
768 769 770 771 772 773
	fl6->daddr = info->key.u.ipv6.dst;
	fl6->saddr = info->key.u.ipv6.src;
	prio = info->key.tos;
	if ((prio == 1) && !geneve->collect_md) {
		prio = ip_tunnel_get_dsfield(ip_hdr(skb), skb);
		use_cache = false;
P
Paolo Abeni 已提交
774 775
	}

776 777
	fl6->flowlabel = ip6_make_flowinfo(RT_TOS(prio),
					   info->key.label);
778
	dst_cache = (struct dst_cache *)&info->dst_cache;
P
Paolo Abeni 已提交
779 780 781 782
	if (use_cache) {
		dst = dst_cache_get_ip6(dst_cache, &fl6->saddr);
		if (dst)
			return dst;
783 784 785 786 787 788 789 790 791 792 793
	}
	if (ipv6_stub->ipv6_dst_lookup(geneve->net, gs6->sock->sk, &dst, fl6)) {
		netdev_dbg(dev, "no route to %pI6\n", &fl6->daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	if (dst->dev == dev) { /* is this necessary? */
		netdev_dbg(dev, "circular route to %pI6\n", &fl6->daddr);
		dst_release(dst);
		return ERR_PTR(-ELOOP);
	}

P
Paolo Abeni 已提交
794 795
	if (use_cache)
		dst_cache_set_ip6(dst_cache, dst, &fl6->saddr);
796 797 798 799
	return dst;
}
#endif

800
static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev,
801 802
			   struct geneve_dev *geneve,
			   const struct ip_tunnel_info *info)
803
{
804 805 806 807
	bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
	struct geneve_sock *gs4 = rcu_dereference(geneve->sock4);
	const struct ip_tunnel_key *key = &info->key;
	struct rtable *rt;
808
	struct flowi4 fl4;
809
	__u8 tos, ttl;
810
	__be16 sport;
811
	__be16 df;
812
	int err;
813

814
	rt = geneve_get_v4_rt(skb, dev, &fl4, info);
815 816
	if (IS_ERR(rt))
		return PTR_ERR(rt);
817 818

	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
819 820
	if (geneve->collect_md) {
		tos = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb);
821
		ttl = key->ttl;
822
	} else {
823 824
		tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, ip_hdr(skb), skb);
		ttl = key->ttl ? : ip4_dst_hoplimit(&rt->dst);
825
	}
826
	df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
827

828
	err = geneve_build_skb(&rt->dst, skb, info, xnet, sizeof(struct iphdr));
829 830
	if (unlikely(err))
		return err;
H
Haishuang Yan 已提交
831

832 833 834 835 836
	udp_tunnel_xmit_skb(rt, gs4->sock->sk, skb, fl4.saddr, fl4.daddr,
			    tos, ttl, df, sport, geneve->info.key.tp_dst,
			    !net_eq(geneve->net, dev_net(geneve->dev)),
			    !(info->key.tun_flags & TUNNEL_CSUM));
	return 0;
837 838
}

839
#if IS_ENABLED(CONFIG_IPV6)
840
static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev,
841 842
			    struct geneve_dev *geneve,
			    const struct ip_tunnel_info *info)
843
{
844 845 846
	bool xnet = !net_eq(geneve->net, dev_net(geneve->dev));
	struct geneve_sock *gs6 = rcu_dereference(geneve->sock6);
	const struct ip_tunnel_key *key = &info->key;
847 848
	struct dst_entry *dst = NULL;
	struct flowi6 fl6;
849
	__u8 prio, ttl;
850
	__be16 sport;
851
	int err;
852 853

	dst = geneve_get_v6_dst(skb, dev, &fl6, info);
854 855
	if (IS_ERR(dst))
		return PTR_ERR(dst);
856 857

	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
858 859 860 861 862 863 864 865
	if (geneve->collect_md) {
		prio = ip_tunnel_ecn_encap(key->tos, ip_hdr(skb), skb);
		ttl = key->ttl;
	} else {
		prio = ip_tunnel_ecn_encap(ip6_tclass(fl6.flowlabel),
					   ip_hdr(skb), skb);
		ttl = key->ttl ? : ip6_dst_hoplimit(dst);
	}
866
	err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr));
867 868
	if (unlikely(err))
		return err;
869

870 871 872 873 874 875 876
	udp_tunnel6_xmit_skb(dst, gs6->sock->sk, skb, dev,
			     &fl6.saddr, &fl6.daddr, prio, ttl,
			     info->key.label, sport, geneve->info.key.tp_dst,
			     !(info->key.tun_flags & TUNNEL_CSUM));
	return 0;
}
#endif
877

878 879 880 881 882
static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct geneve_dev *geneve = netdev_priv(dev);
	struct ip_tunnel_info *info = NULL;
	int err;
883

884 885 886 887 888
	if (geneve->collect_md) {
		info = skb_tunnel_info(skb);
		if (unlikely(!info || !(info->mode & IP_TUNNEL_INFO_TX))) {
			err = -EINVAL;
			netdev_dbg(dev, "no tunnel metadata\n");
889
			goto tx_error;
890
		}
891
	} else {
892
		info = &geneve->info;
893
	}
894

J
Jakub Kicinski 已提交
895
	rcu_read_lock();
896 897 898 899 900 901
#if IS_ENABLED(CONFIG_IPV6)
	if (info->mode & IP_TUNNEL_INFO_IPV6)
		err = geneve6_xmit_skb(skb, dev, geneve, info);
	else
#endif
		err = geneve_xmit_skb(skb, dev, geneve, info);
J
Jakub Kicinski 已提交
902
	rcu_read_unlock();
903

904 905
	if (likely(!err))
		return NETDEV_TX_OK;
906 907
tx_error:
	dev_kfree_skb(skb);
908

909 910 911 912
	if (err == -ELOOP)
		dev->stats.collisions++;
	else if (err == -ENETUNREACH)
		dev->stats.tx_carrier_errors++;
H
Haishuang Yan 已提交
913 914

	dev->stats.tx_errors++;
915 916 917
	return NETDEV_TX_OK;
}

918
static int geneve_change_mtu(struct net_device *dev, int new_mtu)
D
David Wragg 已提交
919
{
920 921
	/* Only possible if called internally, ndo_change_mtu path's new_mtu
	 * is guaranteed to be between dev->min_mtu and dev->max_mtu.
D
David Wragg 已提交
922
	 */
923 924
	if (new_mtu > dev->max_mtu)
		new_mtu = dev->max_mtu;
D
David Wragg 已提交
925

D
David Wragg 已提交
926 927 928 929
	dev->mtu = new_mtu;
	return 0;
}

930 931 932 933 934
static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	struct geneve_dev *geneve = netdev_priv(dev);

935
	if (ip_tunnel_info_af(info) == AF_INET) {
936 937 938
		struct rtable *rt;
		struct flowi4 fl4;

939 940 941
		rt = geneve_get_v4_rt(skb, dev, &fl4, info);
		if (IS_ERR(rt))
			return PTR_ERR(rt);
942

943 944 945 946
		ip_rt_put(rt);
		info->key.u.ipv4.src = fl4.saddr;
#if IS_ENABLED(CONFIG_IPV6)
	} else if (ip_tunnel_info_af(info) == AF_INET6) {
947 948 949
		struct dst_entry *dst;
		struct flowi6 fl6;

950 951 952 953 954 955 956 957 958 959
		dst = geneve_get_v6_dst(skb, dev, &fl6, info);
		if (IS_ERR(dst))
			return PTR_ERR(dst);

		dst_release(dst);
		info->key.u.ipv6.src = fl6.saddr;
#endif
	} else {
		return -EINVAL;
	}
960 961 962

	info->key.tp_src = udp_flow_src_port(geneve->net, skb,
					     1, USHRT_MAX, true);
963
	info->key.tp_dst = geneve->info.key.tp_dst;
964 965 966
	return 0;
}

967 968 969 970 971 972 973
static const struct net_device_ops geneve_netdev_ops = {
	.ndo_init		= geneve_init,
	.ndo_uninit		= geneve_uninit,
	.ndo_open		= geneve_open,
	.ndo_stop		= geneve_stop,
	.ndo_start_xmit		= geneve_xmit,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
D
David Wragg 已提交
974
	.ndo_change_mtu		= geneve_change_mtu,
975 976
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_set_mac_address	= eth_mac_addr,
977
	.ndo_fill_metadata_dst	= geneve_fill_metadata_dst,
978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996
};

/* ethtool get_drvinfo: report the driver name and version. */
static void geneve_get_drvinfo(struct net_device *dev,
			       struct ethtool_drvinfo *drvinfo)
{
	strlcpy(drvinfo->driver, "geneve", sizeof(drvinfo->driver));
	strlcpy(drvinfo->version, GENEVE_NETDEV_VER, sizeof(drvinfo->version));
}

/* ethtool operations of geneve devices (installed by geneve_setup()). */
static const struct ethtool_ops geneve_ethtool_ops = {
	.get_drvinfo	= geneve_get_drvinfo,
	.get_link	= ethtool_op_get_link,
};

/* Info for udev, that this is a virtual tunnel endpoint.
 * Attached to every geneve device by SET_NETDEV_DEVTYPE() in geneve_setup().
 */
static struct device_type geneve_type = {
	.name = "geneve",
};

997
/* Calls the ndo_udp_tunnel_add of the caller in order to
998
 * supply the listening GENEVE udp ports. Callers are expected
999
 * to implement the ndo_udp_tunnel_add.
1000
 */
1001
static void geneve_push_rx_ports(struct net_device *dev)
1002 1003 1004 1005
{
	struct net *net = dev_net(dev);
	struct geneve_net *gn = net_generic(net, geneve_net_id);
	struct geneve_sock *gs;
1006

1007
	rcu_read_lock();
1008 1009 1010
	list_for_each_entry_rcu(gs, &gn->sock_list, list)
		udp_tunnel_push_rx_port(dev, gs->sock,
					UDP_TUNNEL_TYPE_GENEVE);
1011 1012 1013
	rcu_read_unlock();
}

1014 1015 1016 1017 1018 1019 1020
/* Initialize the device structure. */
static void geneve_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->netdev_ops = &geneve_netdev_ops;
	dev->ethtool_ops = &geneve_ethtool_ops;
1021
	dev->needs_free_netdev = true;
1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032

	SET_NETDEV_DEVTYPE(dev, &geneve_type);

	dev->features    |= NETIF_F_LLTX;
	dev->features    |= NETIF_F_SG | NETIF_F_HW_CSUM;
	dev->features    |= NETIF_F_RXCSUM;
	dev->features    |= NETIF_F_GSO_SOFTWARE;

	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
	dev->hw_features |= NETIF_F_GSO_SOFTWARE;

1033 1034 1035 1036 1037 1038 1039 1040
	/* MTU range: 68 - (something less than 65535) */
	dev->min_mtu = ETH_MIN_MTU;
	/* The max_mtu calculation does not take account of GENEVE
	 * options, to avoid excluding potentially valid
	 * configurations. This will be further reduced by IPvX hdr size.
	 */
	dev->max_mtu = IP_MAX_MTU - GENEVE_BASE_HLEN - dev->hard_header_len;

1041
	netif_keep_dst(dev);
J
Jiri Benc 已提交
1042
	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
1043
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
1044
	eth_hw_addr_random(dev);
1045 1046 1047 1048 1049
}

static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
	[IFLA_GENEVE_ID]		= { .type = NLA_U32 },
	[IFLA_GENEVE_REMOTE]		= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1050
	[IFLA_GENEVE_REMOTE6]		= { .len = sizeof(struct in6_addr) },
1051
	[IFLA_GENEVE_TTL]		= { .type = NLA_U8 },
1052
	[IFLA_GENEVE_TOS]		= { .type = NLA_U8 },
1053
	[IFLA_GENEVE_LABEL]		= { .type = NLA_U32 },
1054
	[IFLA_GENEVE_PORT]		= { .type = NLA_U16 },
1055
	[IFLA_GENEVE_COLLECT_METADATA]	= { .type = NLA_FLAG },
1056 1057 1058
	[IFLA_GENEVE_UDP_CSUM]		= { .type = NLA_U8 },
	[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
	[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]	= { .type = NLA_U8 },
1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083
};

static int geneve_validate(struct nlattr *tb[], struct nlattr *data[])
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
			return -EINVAL;

		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
			return -EADDRNOTAVAIL;
	}

	if (!data)
		return -EINVAL;

	if (data[IFLA_GENEVE_ID]) {
		__u32 vni =  nla_get_u32(data[IFLA_GENEVE_ID]);

		if (vni >= GENEVE_VID_MASK)
			return -ERANGE;
	}

	return 0;
}

1084
/* Look through the per-namespace device list for a tunnel that clashes
 * with @info.
 *
 * @gn:               per-namespace geneve state holding the device list
 * @info:             tunnel parameters of the device being configured
 * @tun_on_same_port: set to true if any listed device uses the same
 *                    destination port, false otherwise
 * @tun_collect_md:   set to the collect_md setting of the last listed
 *                    device on the same port, false if there is none
 *
 * Returns the last listed device whose VNI, destination port and
 * address union all equal those of @info, or NULL if there is none.
 */
static struct geneve_dev *geneve_find_dev(struct geneve_net *gn,
					  const struct ip_tunnel_info *info,
					  bool *tun_on_same_port,
					  bool *tun_collect_md)
{
	struct geneve_dev *cur;
	struct geneve_dev *match = NULL;

	*tun_on_same_port = false;
	*tun_collect_md = false;

	list_for_each_entry(cur, &gn->geneve_list, next) {
		/* a different port can neither share nor clash */
		if (cur->info.key.tp_dst != info->key.tp_dst)
			continue;

		*tun_on_same_port = true;
		*tun_collect_md = cur->collect_md;

		if (cur->info.key.tun_id == info->key.tun_id &&
		    memcmp(&info->key.u, &cur->info.key.u,
			   sizeof(info->key.u)) == 0)
			match = cur;
	}

	return match;
}

1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125
/* Check whether a byte buffer contains only zero bytes.
 *
 * @fp:   start of the buffer
 * @size: number of bytes to inspect
 *
 * Returns true if every one of the @size bytes is zero (trivially true
 * for @size == 0), false as soon as a non-zero byte is found.
 */
static bool is_all_zero(const u8 *fp, size_t size)
{
	/* index with size_t so it matches the type of @size; an int index
	 * is a signed/unsigned comparison and overflows for large sizes
	 */
	size_t i;

	for (i = 0; i < size; i++) {
		if (fp[i])
			return false;
	}

	return true;
}

/* Report whether @info carries no tunnel parameters apart from the
 * destination port: tun_id, tun_flags, tos, ttl, label, tp_src and the
 * whole address union must all be zero. key.tp_dst is not inspected.
 */
static bool is_tnl_info_zero(const struct ip_tunnel_info *info)
{
	bool scalar_set = info->key.tun_id || info->key.tun_flags ||
			  info->key.tos || info->key.ttl ||
			  info->key.label || info->key.tp_src;

	return !scalar_set &&
	       is_all_zero((const u8 *)&info->key.u, sizeof(info->key.u));
}

1126
/* Apply tunnel parameters to a freshly allocated geneve netdev and
 * register it.
 *
 * @net:          namespace whose geneve device list is searched and extended
 * @dev:          net_device to configure; its private area is a geneve_dev
 * @info:         tunnel parameters to copy into the device
 * @metadata:     true for a collect-metadata device; @info must then be
 *                zero apart from the destination port
 * @ipv6_rx_csum: value stored in use_udp6_rx_checksums
 *
 * Returns 0 on success, -EINVAL if @metadata is set with non-zero tunnel
 * parameters, -EBUSY if geneve_find_dev() reports a matching device,
 * -EPERM if the destination port cannot be shared between metadata and
 * non-metadata devices, or the error from register_netdevice().
 */
static int geneve_configure(struct net *net, struct net_device *dev,
			    const struct ip_tunnel_info *info,
			    bool metadata, bool ipv6_rx_csum)
{
	struct geneve_net *gn = net_generic(net, geneve_net_id);
	struct geneve_dev *geneve = netdev_priv(dev);
	bool port_in_use, port_has_md;
	int ip_hlen, err;

	if (metadata && !is_tnl_info_zero(info))
		return -EINVAL;

	geneve->net = net;
	geneve->dev = dev;

	if (geneve_find_dev(gn, info, &port_in_use, &port_has_md))
		return -EBUSY;

	/* only a non-metadata IPv4 tunnel is sized for an IPv4 outer
	 * header; everything else is sized for IPv6
	 */
	if (!metadata && ip_tunnel_info_af(info) == AF_INET)
		ip_hlen = sizeof(struct iphdr);
	else
		ip_hlen = sizeof(struct ipv6hdr);

	dev->max_mtu -= ip_hlen;

	/* make enough headroom for basic scenario */
	dev->needed_headroom = GENEVE_BASE_HLEN + ETH_HLEN + ip_hlen +
			       ETH_HLEN;

	/* a metadata device needs the port for itself, and a plain device
	 * cannot join a port on which a metadata device was found
	 */
	if (metadata ? port_in_use : port_has_md)
		return -EPERM;

	dst_cache_reset(&geneve->info.dst_cache);
	geneve->info = *info;
	geneve->collect_md = metadata;
	geneve->use_udp6_rx_checksums = ipv6_rx_csum;

	err = register_netdevice(dev);
	if (err)
		return err;

	/* only successfully registered devices go on the per-netns list */
	list_add(&geneve->next, &gn->geneve_list);
	return 0;
}

1177 1178 1179 1180 1181 1182
/* Reset @info to all zeroes and set its destination port.
 *
 * @info:     tunnel info to initialise; every byte is cleared first
 * @dst_port: UDP destination port in host byte order; stored in
 *            key.tp_dst after htons() conversion
 */
static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port)
{
	memset(info, 0, sizeof(struct ip_tunnel_info));

	info->key.tp_dst = htons(dst_port);
}

1183
static int geneve_newlink(struct net *net, struct net_device *dev,
1184 1185
			  struct nlattr *tb[], struct nlattr *data[],
			  struct netlink_ext_ack *extack)
1186
{
1187 1188
	bool use_udp6_rx_checksums = false;
	struct ip_tunnel_info info;
1189
	bool metadata = false;
1190 1191

	init_tnl_info(&info, GENEVE_UDP_PORT);
1192

1193 1194 1195 1196
	if (data[IFLA_GENEVE_REMOTE] && data[IFLA_GENEVE_REMOTE6])
		return -EINVAL;

	if (data[IFLA_GENEVE_REMOTE]) {
1197
		info.key.u.ipv4.dst =
1198
			nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);
1199 1200 1201 1202 1203

		if (IN_MULTICAST(ntohl(info.key.u.ipv4.dst))) {
			netdev_dbg(dev, "multicast remote is unsupported\n");
			return -EINVAL;
		}
1204 1205 1206
	}

	if (data[IFLA_GENEVE_REMOTE6]) {
1207 1208 1209
 #if IS_ENABLED(CONFIG_IPV6)
		info.mode = IP_TUNNEL_INFO_IPV6;
		info.key.u.ipv6.dst =
1210 1211
			nla_get_in6_addr(data[IFLA_GENEVE_REMOTE6]);

1212
		if (ipv6_addr_type(&info.key.u.ipv6.dst) &
1213 1214 1215 1216
		    IPV6_ADDR_LINKLOCAL) {
			netdev_dbg(dev, "link-local remote is unsupported\n");
			return -EINVAL;
		}
1217 1218 1219 1220 1221 1222 1223 1224 1225
		if (ipv6_addr_is_multicast(&info.key.u.ipv6.dst)) {
			netdev_dbg(dev, "multicast remote is unsupported\n");
			return -EINVAL;
		}
		info.key.tun_flags |= TUNNEL_CSUM;
		use_udp6_rx_checksums = true;
#else
		return -EPFNOSUPPORT;
#endif
1226 1227
	}

1228 1229 1230 1231
	if (data[IFLA_GENEVE_ID]) {
		__u32 vni;
		__u8 tvni[3];

1232
		vni = nla_get_u32(data[IFLA_GENEVE_ID]);
1233 1234 1235
		tvni[0] = (vni & 0x00ff0000) >> 16;
		tvni[1] = (vni & 0x0000ff00) >> 8;
		tvni[2] =  vni & 0x000000ff;
1236

1237 1238
		info.key.tun_id = vni_to_tunnel_id(tvni);
	}
1239
	if (data[IFLA_GENEVE_TTL])
1240
		info.key.ttl = nla_get_u8(data[IFLA_GENEVE_TTL]);
1241

1242
	if (data[IFLA_GENEVE_TOS])
1243
		info.key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
1244

1245 1246 1247 1248 1249 1250
	if (data[IFLA_GENEVE_LABEL]) {
		info.key.label = nla_get_be32(data[IFLA_GENEVE_LABEL]) &
				  IPV6_FLOWLABEL_MASK;
		if (info.key.label && (!(info.mode & IP_TUNNEL_INFO_IPV6)))
			return -EINVAL;
	}
1251

1252
	if (data[IFLA_GENEVE_PORT])
1253
		info.key.tp_dst = nla_get_be16(data[IFLA_GENEVE_PORT]);
1254

1255 1256
	if (data[IFLA_GENEVE_COLLECT_METADATA])
		metadata = true;
1257

1258
	if (data[IFLA_GENEVE_UDP_CSUM] &&
1259
	    nla_get_u8(data[IFLA_GENEVE_UDP_CSUM]))
1260
		info.key.tun_flags |= TUNNEL_CSUM;
1261 1262 1263

	if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX] &&
	    nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_TX]))
1264
		info.key.tun_flags &= ~TUNNEL_CSUM;
1265 1266 1267

	if (data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX] &&
	    nla_get_u8(data[IFLA_GENEVE_UDP_ZERO_CSUM6_RX]))
1268
		use_udp6_rx_checksums = false;
1269

1270
	return geneve_configure(net, dev, &info, metadata, use_udp6_rx_checksums);
1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283
}

/* rtnl_link_ops dellink handler.
 *
 * @dev:  geneve net_device to remove
 * @head: list on which @dev is queued for unregistration
 *
 * Removes the device from the per-namespace geneve list and then queues
 * it for unregistration on @head.
 */
static void geneve_dellink(struct net_device *dev, struct list_head *head)
{
	struct geneve_dev *priv = netdev_priv(dev);

	/* drop it from the geneve list before queueing the unregister */
	list_del(&priv->next);

	unregister_netdevice_queue(dev, head);
}

static size_t geneve_get_size(const struct net_device *dev)
{
	return nla_total_size(sizeof(__u32)) +	/* IFLA_GENEVE_ID */
1284
		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_GENEVE_REMOTE{6} */
1285
		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TTL */
1286
		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TOS */
1287
		nla_total_size(sizeof(__be32)) +  /* IFLA_GENEVE_LABEL */
1288
		nla_total_size(sizeof(__be16)) +  /* IFLA_GENEVE_PORT */
1289
		nla_total_size(0) +	 /* IFLA_GENEVE_COLLECT_METADATA */
1290 1291 1292
		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */
		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX */
		nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */
1293 1294 1295 1296 1297 1298
		0;
}

static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct geneve_dev *geneve = netdev_priv(dev);
1299 1300
	struct ip_tunnel_info *info = &geneve->info;
	__u8 tmp_vni[3];
1301 1302
	__u32 vni;

1303 1304
	tunnel_id_to_vni(info->key.tun_id, tmp_vni);
	vni = (tmp_vni[0] << 16) | (tmp_vni[1] << 8) | tmp_vni[2];
1305 1306 1307
	if (nla_put_u32(skb, IFLA_GENEVE_ID, vni))
		goto nla_put_failure;

1308
	if (rtnl_dereference(geneve->sock4)) {
1309
		if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE,
1310 1311 1312 1313 1314
				    info->key.u.ipv4.dst))
			goto nla_put_failure;

		if (nla_put_u8(skb, IFLA_GENEVE_UDP_CSUM,
			       !!(info->key.tun_flags & TUNNEL_CSUM)))
1315
			goto nla_put_failure;
1316

1317 1318
	}

1319
#if IS_ENABLED(CONFIG_IPV6)
1320
	if (rtnl_dereference(geneve->sock6)) {
1321
		if (nla_put_in6_addr(skb, IFLA_GENEVE_REMOTE6,
1322 1323 1324 1325 1326 1327 1328 1329 1330
				     &info->key.u.ipv6.dst))
			goto nla_put_failure;

		if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_TX,
			       !(info->key.tun_flags & TUNNEL_CSUM)))
			goto nla_put_failure;

		if (nla_put_u8(skb, IFLA_GENEVE_UDP_ZERO_CSUM6_RX,
			       !geneve->use_udp6_rx_checksums))
1331 1332
			goto nla_put_failure;
	}
1333
#endif
1334

1335 1336 1337
	if (nla_put_u8(skb, IFLA_GENEVE_TTL, info->key.ttl) ||
	    nla_put_u8(skb, IFLA_GENEVE_TOS, info->key.tos) ||
	    nla_put_be32(skb, IFLA_GENEVE_LABEL, info->key.label))
1338 1339
		goto nla_put_failure;

1340
	if (nla_put_be16(skb, IFLA_GENEVE_PORT, info->key.tp_dst))
1341 1342
		goto nla_put_failure;

1343 1344 1345 1346
	if (geneve->collect_md) {
		if (nla_put_flag(skb, IFLA_GENEVE_COLLECT_METADATA))
			goto nla_put_failure;
	}
1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/* rtnetlink glue for links of kind "geneve": attribute policy, private
 * data size and the handlers defined in this file.
 */
static struct rtnl_link_ops geneve_link_ops __read_mostly = {
	.kind		= "geneve",
	.priv_size	= sizeof(struct geneve_dev),

	/* attribute parsing */
	.maxtype	= IFLA_GENEVE_MAX,
	.policy		= geneve_policy,
	.validate	= geneve_validate,

	/* device lifetime */
	.setup		= geneve_setup,
	.newlink	= geneve_newlink,
	.dellink	= geneve_dellink,

	/* dumping */
	.get_size	= geneve_get_size,
	.fill_info	= geneve_fill_info,
};

1366 1367 1368 1369
/* Create a collect-metadata geneve device from inside the kernel.
 *
 * @net:              namespace to create the device in
 * @name:             interface name passed to rtnl_create_link()
 * @name_assign_type: name assignment type passed to rtnl_create_link()
 * @dst_port:         UDP destination port in host byte order
 *
 * The device is configured with metadata collection and IPv6 rx
 * checksums enabled, then its MTU is raised as far as
 * geneve_change_mtu() allows.
 *
 * Returns the new net_device, or an ERR_PTR() on failure. A device that
 * fails after registration is unregistered again before returning.
 */
struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
					u8 name_assign_type, u16 dst_port)
{
	struct nlattr *tb[IFLA_MAX + 1];
	struct ip_tunnel_info info;
	struct net_device *dev;
	LIST_HEAD(list_kill);
	int err;

	/* no link attributes are supplied: hand over an empty table */
	memset(tb, 0, sizeof(tb));
	dev = rtnl_create_link(net, name, name_assign_type,
			       &geneve_link_ops, tb);
	if (IS_ERR(dev))
		return dev;

	init_tnl_info(&info, dst_port);
	err = geneve_configure(net, dev, &info, true, true);
	if (err) {
		/* configuration failed, so the device is freed directly */
		free_netdev(dev);
		return ERR_PTR(err);
	}

	/* openvswitch users expect packet sizes to be unrestricted,
	 * so set the largest MTU we can.
	 */
	err = geneve_change_mtu(dev, IP_MAX_MTU);
	if (err)
		goto out_unregister;

	err = rtnl_configure_link(dev, NULL);
	if (err < 0)
		goto out_unregister;

	return dev;

out_unregister:
	geneve_dellink(dev, &list_kill);
	unregister_netdevice_many(&list_kill);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(geneve_dev_create_fb);

1407 1408 1409 1410 1411
static int geneve_netdevice_event(struct notifier_block *unused,
				  unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

1412
	if (event == NETDEV_UDP_TUNNEL_PUSH_INFO)
1413 1414 1415 1416 1417 1418 1419 1420 1421
		geneve_push_rx_ports(dev);

	return NOTIFY_DONE;
}

/* Netdevice notifier registered by geneve_init_module(); every event is
 * delivered to geneve_netdevice_event().
 */
static struct notifier_block geneve_notifier_block __read_mostly = {
	.notifier_call = geneve_netdevice_event,
};

1422 1423 1424 1425 1426
/* Per-namespace init: start with empty device and socket lists.
 * Always returns 0.
 */
static __net_init int geneve_init_net(struct net *net)
{
	struct geneve_net *pernet = net_generic(net, geneve_net_id);

	INIT_LIST_HEAD(&pernet->sock_list);
	INIT_LIST_HEAD(&pernet->geneve_list);

	return 0;
}

/* Per-namespace exit: unregister every geneve device tied to @net.
 *
 * Two groups are collected under the rtnl lock and unregistered in one
 * batch: geneve devices currently living in @net, and devices on this
 * namespace's geneve list that now live in a different namespace.
 */
static void __net_exit geneve_exit_net(struct net *net)
{
	struct geneve_net *gn = net_generic(net, geneve_net_id);
	struct geneve_dev *entry, *entry_tmp;
	struct net_device *ndev, *ndev_tmp;
	LIST_HEAD(kill_list);

	rtnl_lock();

	/* gather any geneve devices that were moved into this ns */
	for_each_netdev_safe(net, ndev, ndev_tmp) {
		if (ndev->rtnl_link_ops != &geneve_link_ops)
			continue;
		unregister_netdevice_queue(ndev, &kill_list);
	}

	/* now gather any other geneve devices that were created in this ns */
	list_for_each_entry_safe(entry, entry_tmp, &gn->geneve_list, next) {
		/* devices still in this netns were queued by the loop above */
		if (net_eq(dev_net(entry->dev), net))
			continue;
		unregister_netdevice_queue(entry->dev, &kill_list);
	}

	/* unregister the devices gathered above */
	unregister_netdevice_many(&kill_list);

	rtnl_unlock();
}

static struct pernet_operations geneve_net_ops = {
	.init = geneve_init_net,
	.exit = geneve_exit_net,
	.id   = &geneve_net_id,
	.size = sizeof(struct geneve_net),
};

/* Module init: register the pernet subsystem, the netdevice notifier and
 * the rtnl link ops, in that order. On failure everything registered so
 * far is unregistered in reverse order and the error is returned.
 */
static int __init geneve_init_module(void)
{
	int rc;

	rc = register_pernet_subsys(&geneve_net_ops);
	if (rc)
		return rc;

	rc = register_netdevice_notifier(&geneve_notifier_block);
	if (rc)
		goto err_pernet;

	rc = rtnl_link_register(&geneve_link_ops);
	if (rc)
		goto err_notifier;

	return 0;

err_notifier:
	unregister_netdevice_notifier(&geneve_notifier_block);
err_pernet:
	unregister_pernet_subsys(&geneve_net_ops);
	return rc;
}
late_initcall(geneve_init_module);

/* Module exit: undo geneve_init_module() in reverse registration order. */
static void __exit geneve_cleanup_module(void)
{
	/* registered last, removed first */
	rtnl_link_unregister(&geneve_link_ops);

	unregister_netdevice_notifier(&geneve_notifier_block);

	/* registered first, removed last */
	unregister_pernet_subsys(&geneve_net_ops);
}
module_exit(geneve_cleanup_module);

/* Module metadata; the rtnl link alias matches geneve_link_ops.kind. */
MODULE_LICENSE("GPL");
MODULE_VERSION(GENEVE_NETDEV_VER);
MODULE_AUTHOR("John W. Linville <linville@tuxdriver.com>");
MODULE_DESCRIPTION("Interface driver for GENEVE encapsulated traffic");
MODULE_ALIAS_RTNL_LINK("geneve");