/*
 * VXLAN: Virtual eXtensible Local Area Network
 *
 * Copyright (c) 2012-2013 Vyatta Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/udp.h>
#include <linux/igmp.h>
#include <linux/if_ether.h>
Y
Yan Burman 已提交
20
#include <linux/ethtool.h>
D
David Stevens 已提交
21 22
#include <net/arp.h>
#include <net/ndisc.h>
S
stephen hemminger 已提交
23 24 25 26 27 28
#include <net/ip.h>
#include <net/icmp.h>
#include <net/rtnetlink.h>
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
29
#include <net/tun_proto.h>
30
#include <net/vxlan.h>
31

C
Cong Wang 已提交
32 33
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ip6_tunnel.h>
34
#include <net/ip6_checksum.h>
C
Cong Wang 已提交
35
#endif
S
stephen hemminger 已提交
36 37 38

#define VXLAN_VERSION	"0.1"

39 40
#define PORT_HASH_BITS	8
#define PORT_HASH_SIZE  (1<<PORT_HASH_BITS)
S
stephen hemminger 已提交
41 42 43
#define FDB_AGE_DEFAULT 300 /* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */

44 45
/* UDP port for VXLAN traffic.
 * The IANA assigned port is 4789, but the Linux default is 8472
 * for compatibility with early adopters.
 */
48 49
static unsigned short vxlan_port __read_mostly = 8472;
module_param_named(udp_port, vxlan_port, ushort, 0444);
S
stephen hemminger 已提交
50 51 52 53 54 55
MODULE_PARM_DESC(udp_port, "Destination UDP port");

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

56
static unsigned int vxlan_net_id;
57
static struct rtnl_link_ops vxlan_link_ops;
58

59
static const u8 all_zeros_mac[ETH_ALEN + 2];
60

61
static int vxlan_sock_add(struct vxlan_dev *vxlan);
62

63 64
static void vxlan_vs_del_dev(struct vxlan_dev *vxlan);

65 66 67 68
/* per-network namespace private data for this module */
struct vxlan_net {
	struct list_head  vxlan_list;
	struct hlist_head sock_list[PORT_HASH_SIZE];
69
	spinlock_t	  sock_lock;
70 71
};

S
stephen hemminger 已提交
72 73 74 75 76 77
/* Forwarding table entry */
struct vxlan_fdb {
	struct hlist_node hlist;	/* linked list of entries */
	struct rcu_head	  rcu;
	unsigned long	  updated;	/* jiffies */
	unsigned long	  used;
78
	struct list_head  remotes;
79
	u8		  eth_addr[ETH_ALEN];
S
stephen hemminger 已提交
80
	u16		  state;	/* see ndm_state */
81
	__be32		  vni;
P
Petr Machata 已提交
82
	u16		  flags;	/* see ndm_flags and below */
S
stephen hemminger 已提交
83 84
};

P
Petr Machata 已提交
85 86
#define NTF_VXLAN_ADDED_BY_USER 0x100

S
stephen hemminger 已提交
87 88 89
/* salt for hash table */
static u32 vxlan_salt __read_mostly;

T
Thomas Graf 已提交
90 91
static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
{
92 93
	return vs->flags & VXLAN_F_COLLECT_METADATA ||
	       ip_tunnel_collect_metadata();
T
Thomas Graf 已提交
94 95
}

C
Cong Wang 已提交
96 97 98 99
#if IS_ENABLED(CONFIG_IPV6)
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
J
Jiri Benc 已提交
100 101 102 103 104 105
	if (a->sa.sa_family != b->sa.sa_family)
		return false;
	if (a->sa.sa_family == AF_INET6)
		return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
	else
		return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
C
Cong Wang 已提交
106 107 108 109
}

static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
J
Jiri Benc 已提交
110
	if (nla_len(nla) >= sizeof(struct in6_addr)) {
111
		ip->sin6.sin6_addr = nla_get_in6_addr(nla);
J
Jiri Benc 已提交
112 113 114
		ip->sa.sa_family = AF_INET6;
		return 0;
	} else if (nla_len(nla) >= sizeof(__be32)) {
115
		ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
J
Jiri Benc 已提交
116 117 118 119 120
		ip->sa.sa_family = AF_INET;
		return 0;
	} else {
		return -EAFNOSUPPORT;
	}
C
Cong Wang 已提交
121 122 123
}

static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
J
Jiri Benc 已提交
124
			      const union vxlan_addr *ip)
C
Cong Wang 已提交
125
{
J
Jiri Benc 已提交
126
	if (ip->sa.sa_family == AF_INET6)
127
		return nla_put_in6_addr(skb, attr, &ip->sin6.sin6_addr);
J
Jiri Benc 已提交
128
	else
129
		return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
C
Cong Wang 已提交
130 131 132 133 134 135 136
}

#else /* !CONFIG_IPV6 */

static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
J
Jiri Benc 已提交
137
	return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
C
Cong Wang 已提交
138 139 140 141
}

static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
J
Jiri Benc 已提交
142 143 144
	if (nla_len(nla) >= sizeof(struct in6_addr)) {
		return -EAFNOSUPPORT;
	} else if (nla_len(nla) >= sizeof(__be32)) {
145
		ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
J
Jiri Benc 已提交
146 147 148 149 150
		ip->sa.sa_family = AF_INET;
		return 0;
	} else {
		return -EAFNOSUPPORT;
	}
C
Cong Wang 已提交
151 152 153
}

static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
J
Jiri Benc 已提交
154
			      const union vxlan_addr *ip)
C
Cong Wang 已提交
155
{
156
	return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
C
Cong Wang 已提交
157 158 159
}
#endif

160
/* Virtual Network hash table head */
161
static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni)
162
{
163
	return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)];
164 165 166 167
}

/* Socket hash table head: bucket of the per-namespace socket table that
 * the (host byte order) UDP @port hashes to.
 */
static inline struct hlist_head *vs_head(struct net *net, __be16 port)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	const u32 bucket = hash_32(ntohs(port), PORT_HASH_BITS);

	return &vn->sock_list[bucket];
}

174 175 176
/* First remote destination for a forwarding entry.
 * Guaranteed to be non-NULL because remotes are never deleted.
 */
177
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
178
{
179 180 181 182 183 184
	return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
}

/* First remote destination for a forwarding entry, using the plain
 * (non-RCU) list accessor.
 */
static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
{
	struct vxlan_rdst *rd;

	rd = list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
	return rd;
}

187 188 189 190
/* Find VXLAN socket based on network namespace, address family and UDP port
 * and enabled unshareable flags.
 */
static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
191
					  __be16 port, u32 flags, int ifindex)
192 193
{
	struct vxlan_sock *vs;
194 195

	flags &= VXLAN_F_RCV_FLAGS;
196 197

	hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
198
		if (inet_sk(vs->sock->sk)->inet_sport == port &&
199
		    vxlan_get_sk_family(vs) == family &&
200 201
		    vs->flags == flags &&
		    vs->sock->sk->sk_bound_dev_if == ifindex)
202 203 204
			return vs;
	}
	return NULL;
S
stephen hemminger 已提交
205 206
}

207 208
/* Look up the VXLAN device attached to socket @vs that handles @vni.
 *
 * For devices flagged VXLAN_F_IPV6_LINKLOCAL (IPv6 builds only) the
 * configured remote_ifindex must also equal @ifindex.  Returns NULL when
 * no device matches.
 */
static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex,
					   __be32 vni)
{
	struct vxlan_dev_node *node;

	/* For flow based devices, map all packets to VNI 0 */
	if (vs->flags & VXLAN_F_COLLECT_METADATA)
		vni = 0;

	hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) {
		struct vxlan_dev *vxlan = node->vxlan;

		if (vxlan->default_dst.remote_vni != vni)
			continue;

		if (IS_ENABLED(CONFIG_IPV6) &&
		    (vxlan->cfg.flags & VXLAN_F_IPV6_LINKLOCAL) &&
		    vxlan->cfg.remote_ifindex != ifindex)
			continue;

		return vxlan;
	}

	return NULL;
}

P
Pravin B Shelar 已提交
234
/* Look up VNI in a per net namespace table */
235 236 237
static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex,
					__be32 vni, sa_family_t family,
					__be16 port, u32 flags)
P
Pravin B Shelar 已提交
238 239 240
{
	struct vxlan_sock *vs;

241
	vs = vxlan_find_sock(net, family, port, flags, ifindex);
P
Pravin B Shelar 已提交
242 243 244
	if (!vs)
		return NULL;

245
	return vxlan_vs_find_vni(vs, ifindex, vni);
P
Pravin B Shelar 已提交
246 247
}

S
stephen hemminger 已提交
248 249
/* Fill in neighbour message in skbuff.
 *
 * Builds one netlink neighbour message of @type describing @fdb and its
 * remote @rdst.  For RTM_GETNEIGH (miss notifications) an all-zero MAC or
 * wildcard remote address is left out of the message.
 *
 * Returns 0 on success, or -EMSGSIZE when @skb has no room; in that case
 * the partially built message is cancelled.
 */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
			  const struct vxlan_fdb *fdb,
			  u32 portid, u32 seq, int type, unsigned int flags,
			  const struct vxlan_rdst *rdst)
{
	const unsigned long now = jiffies;
	bool send_eth = true;
	bool send_ip = true;
	struct nda_cacheinfo ci;
	struct nlmsghdr *nlh;
	struct ndmsg *ndm;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
	if (!nlh)
		return -EMSGSIZE;

	ndm = nlmsg_data(nlh);
	memset(ndm, 0, sizeof(*ndm));

	if (type == RTM_GETNEIGH) {
		send_ip = !vxlan_addr_any(&rdst->remote_ip);
		send_eth = !is_zero_ether_addr(fdb->eth_addr);
		ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET;
	} else {
		ndm->ndm_family	= AF_BRIDGE;
	}

	ndm->ndm_state = fdb->state;
	ndm->ndm_ifindex = vxlan->dev->ifindex;
	ndm->ndm_flags = fdb->flags;
	if (rdst->offloaded)
		ndm->ndm_flags |= NTF_OFFLOADED;
	ndm->ndm_type = RTN_UNICAST;

	/* Report the peer namespace id only when the device and its
	 * underlay live in different namespaces.
	 */
	if (!net_eq(dev_net(vxlan->dev), vxlan->net) &&
	    nla_put_s32(skb, NDA_LINK_NETNSID,
			peernet2id(dev_net(vxlan->dev), vxlan->net)))
		goto nla_put_failure;

	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
		goto nla_put_failure;

	if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
		goto nla_put_failure;

	/* Port and VNI are only reported when they differ from the
	 * device defaults.
	 */
	if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port &&
	    nla_put_be16(skb, NDA_PORT, rdst->remote_port))
		goto nla_put_failure;

	if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
	    nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
		goto nla_put_failure;

	if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni &&
	    nla_put_u32(skb, NDA_SRC_VNI, be32_to_cpu(fdb->vni)))
		goto nla_put_failure;

	if (rdst->remote_ifindex &&
	    nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
		goto nla_put_failure;

	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
	ci.ndm_confirmed = 0;
	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
	ci.ndm_refcnt	 = 0;

	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static inline size_t vxlan_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct ndmsg))
		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
C
Cong Wang 已提交
327
		+ nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
328
		+ nla_total_size(sizeof(__be16)) /* NDA_PORT */
329 330
		+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
		+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
331
		+ nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */
S
stephen hemminger 已提交
332 333 334
		+ nla_total_size(sizeof(struct nda_cacheinfo));
}

P
Petr Machata 已提交
335 336
static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			       struct vxlan_rdst *rd, int type)
S
stephen hemminger 已提交
337 338 339 340 341 342 343 344 345
{
	struct net *net = dev_net(vxlan->dev);
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
	if (skb == NULL)
		goto errout;

346
	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd);
S
stephen hemminger 已提交
347 348 349 350 351 352 353 354 355 356 357 358 359 360
	if (err < 0) {
		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}

361 362 363 364 365 366
static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan,
			    const struct vxlan_fdb *fdb,
			    const struct vxlan_rdst *rd,
			    struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	fdb_info->info.dev = vxlan->dev;
367
	fdb_info->info.extack = NULL;
368 369 370 371 372 373 374 375 376 377
	fdb_info->remote_ip = rd->remote_ip;
	fdb_info->remote_port = rd->remote_port;
	fdb_info->remote_vni = rd->remote_vni;
	fdb_info->remote_ifindex = rd->remote_ifindex;
	memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN);
	fdb_info->vni = fdb->vni;
	fdb_info->offloaded = rd->offloaded;
	fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER;
}

P
Petr Machata 已提交
378 379 380 381 382 383 384 385 386 387 388 389 390
/* Tell switchdev listeners that remote @rd of @fdb was added to
 * (@adding true) or removed from (@adding false) the device.  A NULL @rd
 * triggers a warning and nothing is sent.
 */
static void vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan,
					       struct vxlan_fdb *fdb,
					       struct vxlan_rdst *rd,
					       bool adding)
{
	struct switchdev_notifier_vxlan_fdb_info info;
	enum switchdev_notifier_type notifier_type;

	if (WARN_ON(!rd))
		return;

	if (adding)
		notifier_type = SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE;
	else
		notifier_type = SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE;

	vxlan_fdb_switchdev_notifier_info(vxlan, fdb, rd, &info);
	call_switchdev_notifiers(notifier_type, vxlan->dev, &info.info);
}

/* Announce a change of @fdb / @rd.  When @swdev_notify is set, add and
 * delete events (RTM_NEWNEIGH / RTM_DELNEIGH) are first passed to the
 * switchdev notifier chain; the netlink notification is always sent.
 */
static void vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			     struct vxlan_rdst *rd, int type, bool swdev_notify)
{
	if (swdev_notify) {
		if (type == RTM_NEWNEIGH)
			vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
							   true);
		else if (type == RTM_DELNEIGH)
			vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
							   false);
	}

	__vxlan_fdb_notify(vxlan, fdb, rd, type);
}

C
Cong Wang 已提交
415
static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
D
David Stevens 已提交
416 417
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
418 419 420 421
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
	struct vxlan_rdst remote = {
C
Cong Wang 已提交
422
		.remote_ip = *ipa, /* goes to NDA_DST */
423
		.remote_vni = cpu_to_be32(VXLAN_N_VID),
424
	};
425

426
	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true);
D
David Stevens 已提交
427 428 429 430
}

static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
{
431 432 433
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
434
	struct vxlan_rdst remote = { };
D
David Stevens 已提交
435 436 437

	memcpy(f.eth_addr, eth_addr, ETH_ALEN);

438
	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true);
D
David Stevens 已提交
439 440
}

S
stephen hemminger 已提交
441 442 443 444 445 446 447 448
/* Hash Ethernet address */
static u32 eth_hash(const unsigned char *addr)
{
	u64 value = get_unaligned((u64 *)addr);

	/* only want 6 bytes */
#ifdef __BIG_ENDIAN
	value >>= 16;
449 450
#else
	value <<= 16;
S
stephen hemminger 已提交
451 452 453 454
#endif
	return hash_64(value, FDB_HASH_BITS);
}

455 456 457 458 459 460 461 462
/* Hash an Ethernet address together with a VNI, salted with vxlan_salt,
 * and reduce the result to an index below FDB_HASH_SIZE.
 */
static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
{
	/* use 1 byte of OUI and 3 bytes of NIC */
	const u32 mac_key = get_unaligned((u32 *)(addr + 2));
	const u32 hash = jhash_2words(mac_key, (__force u32)vni, vxlan_salt);

	return hash & (FDB_HASH_SIZE - 1);
}

S
stephen hemminger 已提交
463 464
/* Hash chain to use given mac address */
static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
465
						const u8 *mac, __be32 vni)
S
stephen hemminger 已提交
466
{
467
	if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)
468 469 470
		return &vxlan->fdb_head[eth_vni_hash(mac, vni)];
	else
		return &vxlan->fdb_head[eth_hash(mac)];
S
stephen hemminger 已提交
471 472 473
}

/* Look up Ethernet address in forwarding table */
474
static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
475
					  const u8 *mac, __be32 vni)
S
stephen hemminger 已提交
476
{
477
	struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni);
S
stephen hemminger 已提交
478 479
	struct vxlan_fdb *f;

480
	hlist_for_each_entry_rcu(f, head, hlist) {
481
		if (ether_addr_equal(mac, f->eth_addr)) {
482
			if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
483 484 485 486 487 488
				if (vni == f->vni)
					return f;
			} else {
				return f;
			}
		}
S
stephen hemminger 已提交
489 490 491 492 493
	}

	return NULL;
}

494
static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
495
					const u8 *mac, __be32 vni)
496 497 498
{
	struct vxlan_fdb *f;

499
	f = __vxlan_find_mac(vxlan, mac, vni);
500
	if (f && f->used != jiffies)
501 502 503 504 505
		f->used = jiffies;

	return f;
}

506 507
/* caller should hold vxlan->hash_lock */
static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
C
Cong Wang 已提交
508
					      union vxlan_addr *ip, __be16 port,
509
					      __be32 vni, __u32 ifindex)
510
{
511
	struct vxlan_rdst *rd;
512

513
	list_for_each_entry(rd, &f->remotes, list) {
C
Cong Wang 已提交
514
		if (vxlan_addr_equal(&rd->remote_ip, ip) &&
515 516 517
		    rd->remote_port == port &&
		    rd->remote_vni == vni &&
		    rd->remote_ifindex == ifindex)
518
			return rd;
519
	}
520

521 522 523
	return NULL;
}

524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547
/* Look up the unicast entry for @mac / @vni on VXLAN device @dev and
 * describe its first remote in @fdb_info.
 *
 * Returns 0 on success, -EINVAL for a multicast or all-zero @mac, and
 * -ENOENT when no entry exists.
 */
int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni,
		      struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	/* padded copy: eth_hash() loads 8 bytes from the address */
	u8 eth_addr[ETH_ALEN + 2] = { 0 };
	struct vxlan_fdb *f;
	int rc = 0;

	if (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac))
		return -EINVAL;

	ether_addr_copy(eth_addr, mac);

	rcu_read_lock();

	f = __vxlan_find_mac(vxlan, eth_addr, vni);
	if (f)
		vxlan_fdb_switchdev_notifier_info(vxlan, f,
						  first_remote_rcu(f),
						  fdb_info);
	else
		rc = -ENOENT;

	rcu_read_unlock();
	return rc;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_find_uc);

P
Petr Machata 已提交
556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602
/* Deliver one SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE event for (@f, @rdst)
 * directly to notifier block @nb.  Returns the notifier result converted
 * to an errno.
 */
static int vxlan_fdb_notify_one(struct notifier_block *nb,
				const struct vxlan_dev *vxlan,
				const struct vxlan_fdb *f,
				const struct vxlan_rdst *rdst)
{
	struct switchdev_notifier_vxlan_fdb_info fdb_info;

	vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, &fdb_info);

	return notifier_to_errno(nb->notifier_call(nb,
					SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE,
					&fdb_info));
}

/* Replay every remote of every forwarding entry with VNI @vni on @dev to
 * notifier block @nb as "add" events, holding the FDB hash lock.
 *
 * Returns 0 on success, -EINVAL if @dev is not a VXLAN device, or the
 * first error reported by the notifier (the replay stops there).
 */
int vxlan_fdb_replay(const struct net_device *dev, __be32 vni,
		     struct notifier_block *nb)
{
	struct vxlan_dev *vxlan;
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	unsigned int h;
	int rc = 0;

	if (!netif_is_vxlan(dev))
		return -EINVAL;
	vxlan = netdev_priv(dev);

	spin_lock_bh(&vxlan->hash_lock);
	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) {
			if (f->vni != vni)
				continue;

			list_for_each_entry(rdst, &f->remotes, list) {
				rc = vxlan_fdb_notify_one(nb, vxlan, f, rdst);
				if (rc)
					goto unlock;
			}
		}
	}

unlock:
	spin_unlock_bh(&vxlan->hash_lock);
	return rc;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_replay);

603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624
/* Clear the 'offloaded' mark on every remote of every forwarding entry
 * with VNI @vni on @dev.  Does nothing if @dev is not a VXLAN device.
 */
void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni)
{
	struct vxlan_dev *vxlan;
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	unsigned int h;

	if (!netif_is_vxlan(dev))
		return;
	vxlan = netdev_priv(dev);

	spin_lock_bh(&vxlan->hash_lock);
	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) {
			if (f->vni != vni)
				continue;

			list_for_each_entry(rdst, &f->remotes, list) {
				rdst->offloaded = false;
			}
		}
	}
	spin_unlock_bh(&vxlan->hash_lock);
}
EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload);

625 626
/* Replace destination of unicast mac.
 *
 * Returns 0 when the requested destination already exists (or the entry
 * has no remotes), and 1 when the first remote was overwritten.
 */
static int vxlan_fdb_replace(struct vxlan_fdb *f,
			     union vxlan_addr *ip, __be16 port, __be32 vni,
			     __u32 ifindex)
{
	struct vxlan_rdst *first;

	if (vxlan_fdb_find_rdst(f, ip, port, vni, ifindex))
		return 0;

	first = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
	if (!first)
		return 0;

	/* the cached route belongs to the old destination */
	dst_cache_reset(&first->dst_cache);
	first->remote_ip = *ip;
	first->remote_port = port;
	first->remote_vni = vni;
	first->remote_ifindex = ifindex;
	return 1;
}

648 649
/* Add/update destinations for multicast */
static int vxlan_fdb_append(struct vxlan_fdb *f,
650
			    union vxlan_addr *ip, __be16 port, __be32 vni,
651
			    __u32 ifindex, struct vxlan_rdst **rdp)
652 653 654 655 656 657 658
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

659 660 661
	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
	if (rd == NULL)
		return -ENOBUFS;
662 663 664 665 666 667

	if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) {
		kfree(rd);
		return -ENOBUFS;
	}

C
Cong Wang 已提交
668
	rd->remote_ip = *ip;
669
	rd->remote_port = port;
670
	rd->offloaded = false;
671 672
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;
673 674 675

	list_add_tail_rcu(&rd->list, &f->remotes);

676
	*rdp = rd;
677 678 679
	return 1;
}

T
Tom Herbert 已提交
680 681 682
static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
					  unsigned int off,
					  struct vxlanhdr *vh, size_t hdrlen,
683 684
					  __be32 vni_field,
					  struct gro_remcsum *grc,
685
					  bool nopartial)
T
Tom Herbert 已提交
686
{
687
	size_t start, offset;
T
Tom Herbert 已提交
688 689

	if (skb->remcsum_offload)
690
		return vh;
T
Tom Herbert 已提交
691 692 693 694

	if (!NAPI_GRO_CB(skb)->csum_valid)
		return NULL;

695 696
	start = vxlan_rco_start(vni_field);
	offset = start + vxlan_rco_offset(vni_field);
T
Tom Herbert 已提交
697

698 699
	vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen,
				     start, offset, grc, nopartial);
T
Tom Herbert 已提交
700 701 702 703 704 705

	skb->remcsum_offload = 1;

	return vh;
}

706 707 708
static struct sk_buff *vxlan_gro_receive(struct sock *sk,
					 struct list_head *head,
					 struct sk_buff *skb)
709
{
710 711
	struct sk_buff *pp = NULL;
	struct sk_buff *p;
712
	struct vxlanhdr *vh, *vh2;
713
	unsigned int hlen, off_vx;
714
	int flush = 1;
715
	struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk);
716
	__be32 flags;
717 718 719
	struct gro_remcsum grc;

	skb_gro_remcsum_init(&grc);
720 721 722 723 724 725 726 727 728 729

	off_vx = skb_gro_offset(skb);
	hlen = off_vx + sizeof(*vh);
	vh   = skb_gro_header_fast(skb, off_vx);
	if (skb_gro_header_hard(skb, hlen)) {
		vh = skb_gro_header_slow(skb, hlen, off_vx);
		if (unlikely(!vh))
			goto out;
	}

T
Tom Herbert 已提交
730 731
	skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));

732
	flags = vh->vx_flags;
T
Tom Herbert 已提交
733 734 735

	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
		vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr),
736
				       vh->vx_vni, &grc,
737 738
				       !!(vs->flags &
					  VXLAN_F_REMCSUM_NOPARTIAL));
T
Tom Herbert 已提交
739 740 741 742 743

		if (!vh)
			goto out;
	}

744 745
	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */

746
	list_for_each_entry(p, head, list) {
747 748 749 750
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		vh2 = (struct vxlanhdr *)(p->data + off_vx);
T
Thomas Graf 已提交
751 752
		if (vh->vx_flags != vh2->vx_flags ||
		    vh->vx_vni != vh2->vx_vni) {
753 754 755 756 757
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}
	}

S
Sabrina Dubroca 已提交
758
	pp = call_gro_receive(eth_gro_receive, head, skb);
759
	flush = 0;
760 761

out:
762
	skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
763 764 765 766

	return pp;
}

767
static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
768
{
769 770 771
	/* Sets 'skb->inner_mac_header' since we are always called with
	 * 'skb->encapsulation' set.
	 */
772
	return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
773 774
}

775 776
static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan,
					 const u8 *mac, __u16 state,
P
Petr Machata 已提交
777
					 __be32 src_vni, __u16 ndm_flags)
778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793
{
	struct vxlan_fdb *f;

	f = kmalloc(sizeof(*f), GFP_ATOMIC);
	if (!f)
		return NULL;
	f->state = state;
	f->flags = ndm_flags;
	f->updated = f->used = jiffies;
	f->vni = src_vni;
	INIT_LIST_HEAD(&f->remotes);
	memcpy(f->eth_addr, mac, ETH_ALEN);

	return f;
}

S
stephen hemminger 已提交
794
static int vxlan_fdb_create(struct vxlan_dev *vxlan,
795 796
			    const u8 *mac, union vxlan_addr *ip,
			    __u16 state, __be16 port, __be32 src_vni,
P
Petr Machata 已提交
797
			    __be32 vni, __u32 ifindex, __u16 ndm_flags,
798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829
			    struct vxlan_fdb **fdb)
{
	struct vxlan_rdst *rd = NULL;
	struct vxlan_fdb *f;
	int rc;

	if (vxlan->cfg.addrmax &&
	    vxlan->addrcnt >= vxlan->cfg.addrmax)
		return -ENOSPC;

	netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
	f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags);
	if (!f)
		return -ENOMEM;

	rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
	if (rc < 0) {
		kfree(f);
		return rc;
	}

	++vxlan->addrcnt;
	hlist_add_head_rcu(&f->hlist,
			   vxlan_fdb_head(vxlan, mac, src_vni));

	*fdb = f;

	return 0;
}

/* Add new entry to forwarding table -- assumes lock held */
static int vxlan_fdb_update(struct vxlan_dev *vxlan,
C
Cong Wang 已提交
830
			    const u8 *mac, union vxlan_addr *ip,
831
			    __u16 state, __u16 flags,
832
			    __be16 port, __be32 src_vni, __be32 vni,
P
Petr Machata 已提交
833
			    __u32 ifindex, __u16 ndm_flags,
834
			    bool swdev_notify)
S
stephen hemminger 已提交
835
{
P
Petr Machata 已提交
836
	__u16 fdb_flags = (ndm_flags & ~NTF_USE);
837
	struct vxlan_rdst *rd = NULL;
S
stephen hemminger 已提交
838 839
	struct vxlan_fdb *f;
	int notify = 0;
840
	int rc;
S
stephen hemminger 已提交
841

842
	f = __vxlan_find_mac(vxlan, mac, src_vni);
S
stephen hemminger 已提交
843 844 845 846 847 848
	if (f) {
		if (flags & NLM_F_EXCL) {
			netdev_dbg(vxlan->dev,
				   "lost race to create %pM\n", mac);
			return -EEXIST;
		}
849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864

		/* Do not allow an externally learned entry to take over an
		 * entry added by the user.
		 */
		if (!(fdb_flags & NTF_EXT_LEARNED) ||
		    !(f->flags & NTF_VXLAN_ADDED_BY_USER)) {
			if (f->state != state) {
				f->state = state;
				f->updated = jiffies;
				notify = 1;
			}
			if (f->flags != fdb_flags) {
				f->flags = fdb_flags;
				f->updated = jiffies;
				notify = 1;
			}
865
		}
866

867 868 869 870
		if ((flags & NLM_F_REPLACE)) {
			/* Only change unicasts */
			if (!(is_multicast_ether_addr(f->eth_addr) ||
			     is_zero_ether_addr(f->eth_addr))) {
871
				notify |= vxlan_fdb_replace(f, ip, port, vni,
872 873 874 875
							   ifindex);
			} else
				return -EOPNOTSUPP;
		}
876
		if ((flags & NLM_F_APPEND) &&
877 878
		    (is_multicast_ether_addr(f->eth_addr) ||
		     is_zero_ether_addr(f->eth_addr))) {
879
			rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
880 881 882 883 884

			if (rc < 0)
				return rc;
			notify |= rc;
		}
885 886 887

		if (ndm_flags & NTF_USE)
			f->used = jiffies;
S
stephen hemminger 已提交
888 889 890 891
	} else {
		if (!(flags & NLM_F_CREATE))
			return -ENOENT;

892 893 894 895 896
		/* Disallow replace to add a multicast entry */
		if ((flags & NLM_F_REPLACE) &&
		    (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
			return -EOPNOTSUPP;

C
Cong Wang 已提交
897
		netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
898
		rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni,
899
				      vni, ifindex, fdb_flags, &f);
900
		if (rc < 0)
901
			return rc;
902
		notify = 1;
S
stephen hemminger 已提交
903 904
	}

905 906 907
	if (notify) {
		if (rd == NULL)
			rd = first_remote_rtnl(f);
908
		vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH, swdev_notify);
909
	}
S
stephen hemminger 已提交
910 911 912 913

	return 0;
}

W
Wei Yongjun 已提交
914
static void vxlan_fdb_free(struct rcu_head *head)
915 916
{
	struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);
917
	struct vxlan_rdst *rd, *nd;
918

919 920
	list_for_each_entry_safe(rd, nd, &f->remotes, list) {
		dst_cache_destroy(&rd->dst_cache);
921
		kfree(rd);
922
	}
923 924 925
	kfree(f);
}

926
/* Remove entry @f from the forwarding table and free it after an RCU
 * grace period.  With @do_notify set, a delete notification is sent for
 * each remote before the entry is unlinked.
 */
static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
			      bool do_notify, bool swdev_notify)
{
	struct vxlan_rdst *rd;

	netdev_dbg(vxlan->dev,
		    "delete %pM\n", f->eth_addr);

	--vxlan->addrcnt;

	if (do_notify) {
		list_for_each_entry(rd, &f->remotes, list) {
			vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH,
					 swdev_notify);
		}
	}

	hlist_del_rcu(&f->hlist);
	call_rcu(&f->rcu, vxlan_fdb_free);
}

944 945 946 947 948 949 950 951 952
/* RCU callback: release a single remote destination and its dst cache. */
static void vxlan_dst_free(struct rcu_head *head)
{
	struct vxlan_rdst *rdst;

	rdst = container_of(head, struct vxlan_rdst, rcu);
	dst_cache_destroy(&rdst->dst_cache);
	kfree(rdst);
}

/* Unlink remote @rd from entry @f, send the delete notification for it,
 * and free it after an RCU grace period.
 */
static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
				  struct vxlan_rdst *rd, bool swdev_notify)
{
	list_del_rcu(&rd->list);

	vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify);

	call_rcu(&rd->rcu, vxlan_dst_free);
}

M
Mike Rapoport 已提交
960
/* Decode the netlink attributes of an FDB request.
 *
 * Each output falls back to a device default when its attribute is
 * absent: the wildcard address of the default remote's family for
 * NDA_DST, cfg.dst_port for NDA_PORT, the default remote VNI for NDA_VNI
 * and NDA_SRC_VNI, and 0 for NDA_IFINDEX.
 *
 * Returns 0 on success, -EINVAL for an attribute of the wrong length,
 * -EADDRNOTAVAIL when NDA_IFINDEX names no device in the namespace, or
 * the error from vxlan_nla_get_addr().
 */
static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
			   union vxlan_addr *ip, __be16 *port, __be32 *src_vni,
			   __be32 *vni, u32 *ifindex)
{
	struct net *net = dev_net(vxlan->dev);
	int err;

	if (tb[NDA_DST]) {
		err = vxlan_nla_get_addr(ip, tb[NDA_DST]);
		if (err)
			return err;
	} else {
		union vxlan_addr *remote = &vxlan->default_dst.remote_ip;

		if (remote->sa.sa_family == AF_INET) {
			ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
			ip->sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			ip->sin6.sin6_addr = in6addr_any;
			ip->sa.sa_family = AF_INET6;
#endif
		}
	}

	if (!tb[NDA_PORT]) {
		*port = vxlan->cfg.dst_port;
	} else {
		if (nla_len(tb[NDA_PORT]) != sizeof(__be16))
			return -EINVAL;
		*port = nla_get_be16(tb[NDA_PORT]);
	}

	if (!tb[NDA_VNI]) {
		*vni = vxlan->default_dst.remote_vni;
	} else {
		if (nla_len(tb[NDA_VNI]) != sizeof(u32))
			return -EINVAL;
		*vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
	}

	if (!tb[NDA_SRC_VNI]) {
		*src_vni = vxlan->default_dst.remote_vni;
	} else {
		if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32))
			return -EINVAL;
		*src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI]));
	}

	if (!tb[NDA_IFINDEX]) {
		*ifindex = 0;
	} else {
		if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
			return -EINVAL;
		*ifindex = nla_get_u32(tb[NDA_IFINDEX]);
		/* the lookup only checks that the interface exists */
		if (!__dev_get_by_index(net, *ifindex))
			return -EADDRNOTAVAIL;
	}

	return 0;
}

/* Add static entry (via netlink)
 *
 * Rejects requests whose state is neither NUD_PERMANENT nor
 * NUD_REACHABLE, that lack NDA_DST, or whose destination family differs
 * from the device's default remote.  The entry is then added or updated
 * under the FDB hash lock and marked as added by the user.
 */
static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr, u16 vid, u16 flags)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	__be32 src_vni, vni;
	union vxlan_addr ip;
	u32 ifindex;
	__be16 port;
	int err;

	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
		pr_info("RTM_NEWNEIGH with invalid state %#x\n",
			ndm->ndm_state);
		return -EINVAL;
	}

	if (!tb[NDA_DST])
		return -EINVAL;

	err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex);
	if (err)
		return err;

	if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family)
		return -EAFNOSUPPORT;

	spin_lock_bh(&vxlan->hash_lock);
	err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags,
			       port, src_vni, vni, ifindex,
			       ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER,
			       true);
	spin_unlock_bh(&vxlan->hash_lock);

	return err;
}

1063 1064
static int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
			      const unsigned char *addr, union vxlan_addr ip,
1065
			      __be16 port, __be32 src_vni, __be32 vni,
1066
			      u32 ifindex, bool swdev_notify)
S
stephen hemminger 已提交
1067 1068
{
	struct vxlan_fdb *f;
1069
	struct vxlan_rdst *rd = NULL;
1070
	int err = -ENOENT;
1071

1072
	f = vxlan_find_mac(vxlan, addr, src_vni);
1073
	if (!f)
1074
		return err;
1075

C
Cong Wang 已提交
1076 1077
	if (!vxlan_addr_any(&ip)) {
		rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex);
1078 1079 1080 1081 1082 1083 1084 1085
		if (!rd)
			goto out;
	}

	/* remove a destination if it's not the only one on the list,
	 * otherwise destroy the fdb entry
	 */
	if (rd && !list_is_singular(&f->remotes)) {
1086
		vxlan_fdb_dst_destroy(vxlan, f, rd, swdev_notify);
1087
		goto out;
S
stephen hemminger 已提交
1088
	}
1089

1090
	vxlan_fdb_destroy(vxlan, f, true, swdev_notify);
1091 1092

out:
1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112
	return 0;
}

/* Remove an FDB entry (or one of its remotes) on request from netlink.
 *
 * Decodes the attributes with vxlan_fdb_parse() and performs the removal
 * with the hash lock held.  Returns 0 or a negative errno.
 */
static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
			    struct net_device *dev,
			    const unsigned char *addr, u16 vid)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	union vxlan_addr remote;
	__be32 src_vni, dst_vni;
	__be16 dst_port;
	u32 oif;
	int rc;

	rc = vxlan_fdb_parse(tb, vxlan, &remote, &dst_port, &src_vni,
			     &dst_vni, &oif);
	if (rc)
		return rc;

	spin_lock_bh(&vxlan->hash_lock);
	rc = __vxlan_fdb_delete(vxlan, addr, remote, dst_port, src_vni,
				dst_vni, oif, true);
	spin_unlock_bh(&vxlan->hash_lock);

	return rc;
}

/* Dump forwarding table */
static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
1122
			  struct net_device *dev,
1123
			  struct net_device *filter_dev, int *idx)
S
stephen hemminger 已提交
1124 1125 1126
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	unsigned int h;
1127
	int err = 0;
S
stephen hemminger 已提交
1128 1129 1130 1131

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct vxlan_fdb *f;

1132
		hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
1133 1134
			struct vxlan_rdst *rd;

1135
			list_for_each_entry_rcu(rd, &f->remotes, list) {
1136
				if (*idx < cb->args[2])
1137 1138
					goto skip;

1139 1140 1141 1142 1143
				err = vxlan_fdb_info(skb, vxlan, f,
						     NETLINK_CB(cb->skb).portid,
						     cb->nlh->nlmsg_seq,
						     RTM_NEWNEIGH,
						     NLM_F_MULTI, rd);
1144
				if (err < 0)
1145 1146
					goto out;
skip:
1147
				*idx += 1;
1148
			}
S
stephen hemminger 已提交
1149 1150
		}
	}
1151
out:
1152
	return err;
S
stephen hemminger 已提交
1153 1154
}

R
Roopa Prabhu 已提交
1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187
/* Look up a single FDB entry by MAC address and VNI and describe its first
 * remote in @skb as an RTM_NEWNEIGH message.
 *
 * The VNI comes from NDA_VNI when present, otherwise from the device's
 * default destination.  Returns -ENOENT (with an extack message) when no
 * entry matches, else the result of vxlan_fdb_info().
 */
static int vxlan_fdb_get(struct sk_buff *skb,
			 struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr,
			 u16 vid, u32 portid, u32 seq,
			 struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	__be32 vni = vxlan->default_dst.remote_vni;
	struct vxlan_fdb *entry;
	int ret;

	if (tb[NDA_VNI])
		vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));

	rcu_read_lock();

	entry = __vxlan_find_mac(vxlan, addr, vni);
	if (entry) {
		ret = vxlan_fdb_info(skb, vxlan, entry, portid, seq,
				     RTM_NEWNEIGH, 0, first_remote_rcu(entry));
	} else {
		NL_SET_ERR_MSG(extack, "Fdb entry not found");
		ret = -ENOENT;
	}

	rcu_read_unlock();
	return ret;
}

S
stephen hemminger 已提交
1188 1189
/* Watch incoming packets to learn mapping between Ethernet address
 * and Tunnel endpoint.
1190
 * Return true if packet is bogus and should be dropped.
S
stephen hemminger 已提交
1191
 */
1192
static bool vxlan_snoop(struct net_device *dev,
1193
			union vxlan_addr *src_ip, const u8 *src_mac,
1194
			u32 src_ifindex, __be32 vni)
S
stephen hemminger 已提交
1195 1196 1197
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
1198 1199 1200 1201 1202 1203 1204
	u32 ifindex = 0;

#if IS_ENABLED(CONFIG_IPV6)
	if (src_ip->sa.sa_family == AF_INET6 &&
	    (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL))
		ifindex = src_ifindex;
#endif
S
stephen hemminger 已提交
1205

1206
	f = vxlan_find_mac(vxlan, src_mac, vni);
S
stephen hemminger 已提交
1207
	if (likely(f)) {
1208
		struct vxlan_rdst *rdst = first_remote_rcu(f);
1209

1210 1211
		if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) &&
			   rdst->remote_ifindex == ifindex))
1212 1213 1214
			return false;

		/* Don't migrate static entries, drop packets */
1215
		if (f->state & (NUD_PERMANENT | NUD_NOARP))
1216
			return true;
S
stephen hemminger 已提交
1217 1218 1219

		if (net_ratelimit())
			netdev_info(dev,
C
Cong Wang 已提交
1220
				    "%pM migrated from %pIS to %pIS\n",
1221
				    src_mac, &rdst->remote_ip.sa, &src_ip->sa);
S
stephen hemminger 已提交
1222

C
Cong Wang 已提交
1223
		rdst->remote_ip = *src_ip;
S
stephen hemminger 已提交
1224
		f->updated = jiffies;
1225
		vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true);
S
stephen hemminger 已提交
1226 1227 1228
	} else {
		/* learned new entry */
		spin_lock(&vxlan->hash_lock);
1229 1230 1231

		/* close off race between vxlan_flush and incoming packets */
		if (netif_running(dev))
1232
			vxlan_fdb_update(vxlan, src_mac, src_ip,
1233 1234
					 NUD_REACHABLE,
					 NLM_F_EXCL|NLM_F_CREATE,
1235
					 vxlan->cfg.dst_port,
1236
					 vni,
1237
					 vxlan->default_dst.remote_vni,
1238
					 ifindex, NTF_SELF, true);
S
stephen hemminger 已提交
1239 1240
		spin_unlock(&vxlan->hash_lock);
	}
1241 1242

	return false;
S
stephen hemminger 已提交
1243 1244 1245
}

/* See if multicast group is already in use by other ID */
1246
static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
S
stephen hemminger 已提交
1247
{
1248
	struct vxlan_dev *vxlan;
1249
	struct vxlan_sock *sock4;
A
Arnd Bergmann 已提交
1250 1251 1252
#if IS_ENABLED(CONFIG_IPV6)
	struct vxlan_sock *sock6;
#endif
1253
	unsigned short family = dev->default_dst.remote_ip.sa.sa_family;
S
stephen hemminger 已提交
1254

1255 1256
	sock4 = rtnl_dereference(dev->vn4_sock);

1257 1258 1259
	/* The vxlan_sock is only used by dev, leaving group has
	 * no effect on other vxlan devices.
	 */
1260
	if (family == AF_INET && sock4 && refcount_read(&sock4->refcnt) == 1)
1261
		return false;
1262
#if IS_ENABLED(CONFIG_IPV6)
1263
	sock6 = rtnl_dereference(dev->vn6_sock);
1264
	if (family == AF_INET6 && sock6 && refcount_read(&sock6->refcnt) == 1)
1265 1266
		return false;
#endif
1267

1268
	list_for_each_entry(vxlan, &vn->vxlan_list, next) {
1269
		if (!netif_running(vxlan->dev) || vxlan == dev)
1270
			continue;
S
stephen hemminger 已提交
1271

1272 1273
		if (family == AF_INET &&
		    rtnl_dereference(vxlan->vn4_sock) != sock4)
1274
			continue;
1275
#if IS_ENABLED(CONFIG_IPV6)
1276 1277
		if (family == AF_INET6 &&
		    rtnl_dereference(vxlan->vn6_sock) != sock6)
1278 1279
			continue;
#endif
1280 1281 1282 1283 1284 1285 1286 1287 1288 1289

		if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip,
				      &dev->default_dst.remote_ip))
			continue;

		if (vxlan->default_dst.remote_ifindex !=
		    dev->default_dst.remote_ifindex)
			continue;

		return true;
1290
	}
S
stephen hemminger 已提交
1291 1292 1293 1294

	return false;
}

1295
static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
1296
{
1297
	struct vxlan_net *vn;
1298

1299
	if (!vs)
1300
		return false;
1301
	if (!refcount_dec_and_test(&vs->refcnt))
1302
		return false;
S
stephen hemminger 已提交
1303

1304
	vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id);
1305
	spin_lock(&vn->sock_lock);
1306
	hlist_del_rcu(&vs->hlist);
1307
	udp_tunnel_notify_del_rx_port(vs->sock,
1308 1309
				      (vs->flags & VXLAN_F_GPE) ?
				      UDP_TUNNEL_TYPE_VXLAN_GPE :
1310
				      UDP_TUNNEL_TYPE_VXLAN);
1311 1312
	spin_unlock(&vn->sock_lock);

1313
	return true;
S
stephen hemminger 已提交
1314 1315
}

1316 1317
static void vxlan_sock_release(struct vxlan_dev *vxlan)
{
1318
	struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
1319
#if IS_ENABLED(CONFIG_IPV6)
1320 1321
	struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

1322
	RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
1323 1324
#endif

1325
	RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
1326 1327
	synchronize_net();

1328 1329
	vxlan_vs_del_dev(vxlan);

1330 1331 1332
	if (__vxlan_sock_release_prep(sock4)) {
		udp_tunnel_sock_release(sock4->sock);
		kfree(sock4);
1333 1334 1335
	}

#if IS_ENABLED(CONFIG_IPV6)
1336 1337 1338
	if (__vxlan_sock_release_prep(sock6)) {
		udp_tunnel_sock_release(sock6->sock);
		kfree(sock6);
1339
	}
1340 1341 1342
#endif
}

1343
/* Update multicast group membership when first VNI on
1344
 * multicast address is brought up
1345
 */
1346
static int vxlan_igmp_join(struct vxlan_dev *vxlan)
S
stephen hemminger 已提交
1347
{
1348
	struct sock *sk;
C
Cong Wang 已提交
1349 1350
	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
	int ifindex = vxlan->default_dst.remote_ifindex;
1351
	int ret = -EINVAL;
S
stephen hemminger 已提交
1352

C
Cong Wang 已提交
1353
	if (ip->sa.sa_family == AF_INET) {
1354
		struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
C
Cong Wang 已提交
1355 1356 1357 1358 1359
		struct ip_mreqn mreq = {
			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
			.imr_ifindex		= ifindex,
		};

1360
		sk = sock4->sock->sk;
1361
		lock_sock(sk);
1362
		ret = ip_mc_join_group(sk, &mreq);
1363
		release_sock(sk);
C
Cong Wang 已提交
1364 1365
#if IS_ENABLED(CONFIG_IPV6)
	} else {
1366 1367 1368
		struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

		sk = sock6->sock->sk;
1369
		lock_sock(sk);
1370 1371
		ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
						   &ip->sin6.sin6_addr);
1372
		release_sock(sk);
C
Cong Wang 已提交
1373 1374
#endif
	}
S
stephen hemminger 已提交
1375

1376
	return ret;
S
stephen hemminger 已提交
1377 1378 1379
}

/* Inverse of vxlan_igmp_join when last VNI is brought down */
1380
static int vxlan_igmp_leave(struct vxlan_dev *vxlan)
S
stephen hemminger 已提交
1381
{
1382
	struct sock *sk;
C
Cong Wang 已提交
1383 1384
	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
	int ifindex = vxlan->default_dst.remote_ifindex;
1385
	int ret = -EINVAL;
S
stephen hemminger 已提交
1386

C
Cong Wang 已提交
1387
	if (ip->sa.sa_family == AF_INET) {
1388
		struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
C
Cong Wang 已提交
1389 1390 1391 1392 1393
		struct ip_mreqn mreq = {
			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
			.imr_ifindex		= ifindex,
		};

1394
		sk = sock4->sock->sk;
1395
		lock_sock(sk);
1396
		ret = ip_mc_leave_group(sk, &mreq);
1397
		release_sock(sk);
C
Cong Wang 已提交
1398 1399
#if IS_ENABLED(CONFIG_IPV6)
	} else {
1400 1401 1402
		struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

		sk = sock6->sock->sk;
1403
		lock_sock(sk);
1404 1405
		ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
						   &ip->sin6.sin6_addr);
1406
		release_sock(sk);
C
Cong Wang 已提交
1407 1408
#endif
	}
S
stephen hemminger 已提交
1409

1410
	return ret;
S
stephen hemminger 已提交
1411 1412
}

1413 1414
/* Handle the remote checksum offload (RCO) part of a received VXLAN header.
 *
 * When the RCO flag is set and the skb is not already marked with
 * remcsum_offload, the checksum start/offset encoded in the VNI field are
 * applied through skb_remcsum_process().  In all successful cases the RCO
 * flag and the non-VNI bits of vx_vni are cleared in @unparsed.
 *
 * Returns false if the checksum field cannot be pulled, true otherwise.
 */
static bool vxlan_remcsum(struct vxlanhdr *unparsed,
			  struct sk_buff *skb, u32 vxflags)
{
	if ((unparsed->vx_flags & VXLAN_HF_RCO) && !skb->remcsum_offload) {
		size_t start = vxlan_rco_start(unparsed->vx_vni);
		size_t offset = start + vxlan_rco_offset(unparsed->vx_vni);

		/* The 16-bit checksum field itself must be accessible. */
		if (!pskb_may_pull(skb, offset + sizeof(u16)))
			return false;

		skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1),
				    start, offset,
				    !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL));
	}

	unparsed->vx_flags &= ~VXLAN_HF_RCO;
	unparsed->vx_vni &= VXLAN_VNI_MASK;

	return true;
}

1435
/* Extract the Group Based Policy fields of a received VXLAN header into
 * @md and clear the GBP bits in @unparsed.
 *
 * Nothing but the bit clearing happens when the GBP flag is not set.
 */
static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
				struct sk_buff *skb, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed;

	if (unparsed->vx_flags & VXLAN_HF_GBP) {
		struct metadata_dst *tun_dst;

		md->gbp = ntohs(gbp->policy_id);

		/* With a metadata dst attached, flag the VXLAN options. */
		tun_dst = (struct metadata_dst *)skb_dst(skb);
		if (tun_dst) {
			tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
			tun_dst->u.tun_info.options_len = sizeof(*md);
		}

		if (gbp->dont_learn)
			md->gbp |= VXLAN_GBP_DONT_LEARN;

		if (gbp->policy_applied)
			md->gbp |= VXLAN_GBP_POLICY_APPLIED;

		/* In flow-based mode, GBP is carried in dst_metadata */
		if (!(vxflags & VXLAN_F_COLLECT_METADATA))
			skb->mark = md->gbp;
	}

	unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS;
}

J
Jiri Benc 已提交
1465
static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed,
J
Jiri Benc 已提交
1466
				__be16 *protocol,
J
Jiri Benc 已提交
1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485
				struct sk_buff *skb, u32 vxflags)
{
	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed;

	/* Need to have Next Protocol set for interfaces in GPE mode. */
	if (!gpe->np_applied)
		return false;
	/* "The initial version is 0. If a receiver does not support the
	 * version indicated it MUST drop the packet.
	 */
	if (gpe->version != 0)
		return false;
	/* "When the O bit is set to 1, the packet is an OAM packet and OAM
	 * processing MUST occur." However, we don't implement OAM
	 * processing, thus drop the packet.
	 */
	if (gpe->oam_flag)
		return false;

1486 1487
	*protocol = tun_p_to_eth_p(gpe->next_protocol);
	if (!*protocol)
J
Jiri Benc 已提交
1488 1489 1490 1491 1492 1493
		return false;

	unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS;
	return true;
}

1494 1495
/* Prepare the inner Ethernet frame of a decapsulated packet and, when
 * learning is enabled, snoop its source address.
 *
 * Returns false if the packet has to be dropped (looped back frame or
 * rejected by vxlan_snoop()), true otherwise.
 */
static bool vxlan_set_mac(struct vxlan_dev *vxlan,
			  struct vxlan_sock *vs,
			  struct sk_buff *skb, __be32 vni)
{
	/* Read the ifindex before eth_type_trans() is handed vxlan->dev. */
	u32 rx_ifindex = skb->dev->ifindex;
	union vxlan_addr outer_src;

	skb_reset_mac_header(skb);
	skb->protocol = eth_type_trans(skb, vxlan->dev);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	/* Ignore packet loops (and multicast echo) */
	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
		return false;

	/* Get address from the outer IP header */
	if (vxlan_get_sk_family(vs) == AF_INET) {
		outer_src.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
		outer_src.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		outer_src.sin6.sin6_addr = ipv6_hdr(skb)->saddr;
		outer_src.sa.sa_family = AF_INET6;
#endif
	}

	if (!(vxlan->cfg.flags & VXLAN_F_LEARN))
		return true;

	return !vxlan_snoop(skb->dev, &outer_src, eth_hdr(skb)->h_source,
			    rx_ifindex, vni);
}

1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550
/* Run ECN decapsulation for the outer header @oiph according to the address
 * family of @vs, logging (rate limited, if log_ecn_error is set) when the
 * helper reports a problem.
 *
 * Returns true when the helper's result is at most 1, false otherwise.
 */
static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
				  struct sk_buff *skb)
{
	bool outer_is_v4 = vxlan_get_sk_family(vs) == AF_INET;
	int err = 0;

	if (outer_is_v4)
		err = IP_ECN_decapsulate(oiph, skb);
#if IS_ENABLED(CONFIG_IPV6)
	else
		err = IP6_ECN_decapsulate(oiph, skb);
#endif

	if (unlikely(err) && log_ecn_error) {
		if (outer_is_v4) {
			struct iphdr *iph = oiph;

			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					     &iph->saddr, iph->tos);
		} else {
			struct ipv6hdr *ip6h = oiph;

			net_info_ratelimited("non-ECT from %pI6\n",
					     &ip6h->saddr);
		}
	}

	return err <= 1;
}

S
stephen hemminger 已提交
1551
/* Callback from net/ipv4/udp.c to receive packets */
1552
static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
S
stephen hemminger 已提交
1553
{
1554
	struct pcpu_sw_netstats *stats;
1555
	struct vxlan_dev *vxlan;
P
Pravin B Shelar 已提交
1556
	struct vxlan_sock *vs;
1557
	struct vxlanhdr unparsed;
T
Thomas Graf 已提交
1558 1559
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
J
Jiri Benc 已提交
1560
	__be16 protocol = htons(ETH_P_TEB);
J
Jiri Benc 已提交
1561
	bool raw_proto = false;
1562
	void *oiph;
1563
	__be32 vni = 0;
S
stephen hemminger 已提交
1564

J
Jiri Benc 已提交
1565
	/* Need UDP and VXLAN header to be present */
1566
	if (!pskb_may_pull(skb, VXLAN_HLEN))
1567
		goto drop;
S
stephen hemminger 已提交
1568

1569
	unparsed = *vxlan_hdr(skb);
J
Jiri Benc 已提交
1570 1571 1572 1573 1574 1575
	/* VNI flag always required to be set */
	if (!(unparsed.vx_flags & VXLAN_HF_VNI)) {
		netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
			   ntohl(vxlan_hdr(skb)->vx_flags),
			   ntohl(vxlan_hdr(skb)->vx_vni));
		/* Return non vxlan pkt */
1576
		goto drop;
S
stephen hemminger 已提交
1577
	}
J
Jiri Benc 已提交
1578 1579
	unparsed.vx_flags &= ~VXLAN_HF_VNI;
	unparsed.vx_vni &= ~VXLAN_VNI_MASK;
S
stephen hemminger 已提交
1580

1581
	vs = rcu_dereference_sk_user_data(sk);
P
Pravin B Shelar 已提交
1582
	if (!vs)
S
stephen hemminger 已提交
1583 1584
		goto drop;

1585 1586
	vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);

1587
	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
1588 1589 1590
	if (!vxlan)
		goto drop;

J
Jiri Benc 已提交
1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602
	/* For backwards compatibility, only allow reserved fields to be
	 * used by VXLAN extensions if explicitly requested.
	 */
	if (vs->flags & VXLAN_F_GPE) {
		if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
			goto drop;
		raw_proto = true;
	}

	if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto,
				   !net_eq(vxlan->net, dev_net(vxlan->dev))))
			goto drop;
1603

T
Thomas Graf 已提交
1604
	if (vxlan_collect_metadata(vs)) {
1605
		struct metadata_dst *tun_dst;
J
Jiri Benc 已提交
1606

1607
		tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY,
1608
					 key32_to_tunnel_id(vni), sizeof(*md));
1609

T
Thomas Graf 已提交
1610 1611 1612
		if (!tun_dst)
			goto drop;

1613
		md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
1614 1615

		skb_dst_set(skb, (struct dst_entry *)tun_dst);
T
Thomas Graf 已提交
1616 1617 1618 1619
	} else {
		memset(md, 0, sizeof(*md));
	}

1620 1621 1622 1623
	if (vs->flags & VXLAN_F_REMCSUM_RX)
		if (!vxlan_remcsum(&unparsed, skb, vs->flags))
			goto drop;
	if (vs->flags & VXLAN_F_GBP)
1624
		vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md);
J
Jiri Benc 已提交
1625 1626 1627
	/* Note that GBP and GPE can never be active together. This is
	 * ensured in vxlan_dev_configure.
	 */
T
Thomas Graf 已提交
1628

1629
	if (unparsed.vx_flags || unparsed.vx_vni) {
1630 1631 1632 1633
		/* If there are any unprocessed flags remaining treat
		 * this as a malformed packet. This behavior diverges from
		 * VXLAN RFC (RFC7348) which stipulates that bits in reserved
		 * in reserved fields are to be ignored. The approach here
1634
		 * maintains compatibility with previous stack code, and also
1635 1636 1637
		 * is more robust and provides a little more security in
		 * adding extensions to VXLAN.
		 */
J
Jiri Benc 已提交
1638
		goto drop;
1639 1640
	}

J
Jiri Benc 已提交
1641
	if (!raw_proto) {
1642
		if (!vxlan_set_mac(vxlan, vs, skb, vni))
J
Jiri Benc 已提交
1643 1644
			goto drop;
	} else {
1645
		skb_reset_mac_header(skb);
J
Jiri Benc 已提交
1646 1647 1648
		skb->dev = vxlan->dev;
		skb->pkt_type = PACKET_HOST;
	}
1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665

	oiph = skb_network_header(skb);
	skb_reset_network_header(skb);

	if (!vxlan_ecn_decapsulate(vs, oiph, skb)) {
		++vxlan->dev->stats.rx_frame_errors;
		++vxlan->dev->stats.rx_errors;
		goto drop;
	}

	stats = this_cpu_ptr(vxlan->dev->tstats);
	u64_stats_update_begin(&stats->syncp);
	stats->rx_packets++;
	stats->rx_bytes += skb->len;
	u64_stats_update_end(&stats->syncp);

	gro_cells_receive(&vxlan->gro_cells, skb);
P
Pravin B Shelar 已提交
1666 1667 1668
	return 0;

drop:
J
Jiri Benc 已提交
1669 1670 1671
	/* Consume bad packet */
	kfree_skb(skb);
	return 0;
P
Pravin B Shelar 已提交
1672 1673
}

S
Stefano Brivio 已提交
1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701
/* Callback from net/ipv{4,6}/udp.c to check that we have a VNI for errors.
 *
 * Returns 0 when the VXLAN header found behind the transport header carries
 * a VNI that is configured on the socket, -EINVAL when the header is
 * missing or has no VNI flag, and -ENOENT when the socket has no vxlan_sock
 * or no device matches the VNI.
 */
static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan;
	struct vxlan_sock *vs;
	struct vxlanhdr *hdr;
	__be32 vni;

	/* Comparing skb->len only proves the bytes exist somewhere in the
	 * skb; the header is dereferenced below, so make sure the UDP and
	 * VXLAN headers are really in the linear area.
	 */
	if (!pskb_may_pull(skb, skb_transport_offset(skb) + VXLAN_HLEN))
		return -EINVAL;

	hdr = vxlan_hdr(skb);

	if (!(hdr->vx_flags & VXLAN_HF_VNI))
		return -EINVAL;

	vs = rcu_dereference_sk_user_data(sk);
	if (!vs)
		return -ENOENT;

	vni = vxlan_vni(hdr->vx_vni);
	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
	if (!vxlan)
		return -ENOENT;

	return 0;
}

1702
/* Answer an ARP request locally from the neighbour table instead of
 * sending it through the tunnel.
 *
 * A reply is generated and injected with netif_rx_ni() when the target is a
 * connected neighbour that is not bridge-local; an unknown target triggers
 * vxlan_ip_miss() if VXLAN_F_L3MISS is set.  The request skb is always
 * consumed and NETDEV_TX_OK returned.
 */
static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct sk_buff *reply;
	struct neighbour *n;
	struct vxlan_fdb *f;
	struct arphdr *parp;
	__be32 sip, tip;
	u8 *sha;

	if (dev->flags & IFF_NOARP)
		goto out;

	if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
		dev->stats.tx_dropped++;
		goto out;
	}
	parp = arp_hdr(skb);

	/* Only IPv4 requests over Ethernet/802 with matching lengths. */
	if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
	     parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
	    parp->ar_pro != htons(ETH_P_IP) ||
	    parp->ar_op != htons(ARPOP_REQUEST) ||
	    parp->ar_hln != dev->addr_len ||
	    parp->ar_pln != 4)
		goto out;

	/* Payload layout after the fixed header: sha, sip, tha, tip. */
	sha = (u8 *)parp + sizeof(struct arphdr);
	memcpy(&sip, sha + dev->addr_len, sizeof(sip));
	memcpy(&tip, sha + dev->addr_len + sizeof(sip) + dev->addr_len,
	       sizeof(tip));

	if (ipv4_is_loopback(tip) ||
	    ipv4_is_multicast(tip))
		goto out;

	n = neigh_lookup(&arp_tbl, &tip, dev);
	if (!n) {
		if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
			union vxlan_addr ipa = {
				.sin.sin_addr.s_addr = tip,
				.sin.sin_family = AF_INET,
			};

			vxlan_ip_miss(dev, &ipa);
		}
		goto out;
	}

	if (!(n->nud_state & NUD_CONNECTED)) {
		neigh_release(n);
		goto out;
	}

	f = vxlan_find_mac(vxlan, n->ha, vni);
	if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
		/* bridge-local neighbor */
		neigh_release(n);
		goto out;
	}

	reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
			   n->ha, sha);

	neigh_release(n);

	if (reply == NULL)
		goto out;

	skb_reset_mac_header(reply);
	__skb_pull(reply, skb_network_offset(reply));
	reply->ip_summed = CHECKSUM_UNNECESSARY;
	reply->pkt_type = PACKET_HOST;

	if (netif_rx_ni(reply) == NET_RX_DROP)
		dev->stats.rx_dropped++;

out:
	consume_skb(skb);
	return NETDEV_TX_OK;
}

C
Cong Wang 已提交
1784
#if IS_ENABLED(CONFIG_IPV6)
1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796
/* Build a Neighbour Advertisement that answers the Neighbour Solicitation
 * in @request on behalf of neighbour @n.
 *
 * The reply is addressed to the source link-layer address option of the
 * request when one is present, otherwise to the Ethernet source of the
 * request.  @isrouter sets the router flag of the advertisement.
 *
 * Returns the new skb, or NULL when @request has no device, cannot be
 * fully pulled, carries a neighbour discovery option of length zero, or
 * when the allocation fails.
 */
static struct sk_buff *vxlan_na_create(struct sk_buff *request,
	struct neighbour *n, bool isrouter)
{
	struct net_device *dev = request->dev;
	struct sk_buff *reply;
	struct nd_msg *ns, *na;
	struct ipv6hdr *pip6;
	u8 *daddr;
	int na_olen = 8; /* opt hdr + ETH_ALEN for target */
	int ns_olen;
	int i, len;

	if (dev == NULL || !pskb_may_pull(request, request->len))
		return NULL;

	len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
		sizeof(*na) + na_olen + dev->needed_tailroom;
	reply = alloc_skb(len, GFP_ATOMIC);
	if (reply == NULL)
		return NULL;

	reply->protocol = htons(ETH_P_IPV6);
	reply->dev = dev;
	skb_reserve(reply, LL_RESERVED_SPACE(request->dev));
	skb_push(reply, sizeof(struct ethhdr));
	skb_reset_mac_header(reply);

	ns = (struct nd_msg *)(ipv6_hdr(request) + 1);

	daddr = eth_hdr(request)->h_source;
	ns_olen = request->len - skb_network_offset(request) -
		sizeof(struct ipv6hdr) - sizeof(*ns);
	for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) {
		/* The length byte is the loop increment (in units of eight
		 * bytes): a zero length would never advance i, so refuse
		 * the malformed request instead of spinning forever.
		 */
		if (!ns->opt[i + 1]) {
			kfree_skb(reply);
			return NULL;
		}
		if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
			daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
			break;
		}
	}

	/* Ethernet header */
	ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
	ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
	eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);
	reply->protocol = htons(ETH_P_IPV6);

	skb_pull(reply, sizeof(struct ethhdr));
	skb_reset_network_header(reply);
	skb_put(reply, sizeof(struct ipv6hdr));

	/* IPv6 header */

	pip6 = ipv6_hdr(reply);
	memset(pip6, 0, sizeof(struct ipv6hdr));
	pip6->version = 6;
	pip6->priority = ipv6_hdr(request)->priority;
	pip6->nexthdr = IPPROTO_ICMPV6;
	pip6->hop_limit = 255;
	pip6->daddr = ipv6_hdr(request)->saddr;
	pip6->saddr = *(struct in6_addr *)n->primary_key;

	skb_pull(reply, sizeof(struct ipv6hdr));
	skb_reset_transport_header(reply);

	/* Neighbor Advertisement */
	na = skb_put_zero(reply, sizeof(*na) + na_olen);
	na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
	na->icmph.icmp6_router = isrouter;
	na->icmph.icmp6_override = 1;
	na->icmph.icmp6_solicited = 1;
	na->target = ns->target;
	ether_addr_copy(&na->opt[2], n->ha);
	na->opt[0] = ND_OPT_TARGET_LL_ADDR;
	na->opt[1] = na_olen >> 3;

	na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
		&pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6,
		csum_partial(na, sizeof(*na)+na_olen, 0));

	pip6->payload_len = htons(sizeof(*na)+na_olen);

	/* Expose the IPv6 header again before handing the reply back. */
	skb_push(reply, sizeof(struct ipv6hdr));

	reply->ip_summed = CHECKSUM_UNNECESSARY;

	return reply;
}

1872
/* IPv6 counterpart of arp_reduce(): answer a neighbour discovery message
 * locally from the ND table when the target is known.
 *
 * A Neighbour Advertisement is created with vxlan_na_create() and injected
 * with netif_rx_ni() when the target is a connected neighbour that is not
 * bridge-local; an unknown target triggers vxlan_ip_miss() if
 * VXLAN_F_L3MISS is set.  The request skb is always consumed and
 * NETDEV_TX_OK returned.
 */
static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct ipv6hdr *ip6h;
	struct sk_buff *reply;
	struct neighbour *n;
	struct vxlan_fdb *f;
	struct nd_msg *msg;

	if (!__in6_dev_get(dev))
		goto out;

	ip6h = ipv6_hdr(skb);
	msg = (struct nd_msg *)(ip6h + 1);

	if (ipv6_addr_loopback(&ip6h->daddr) ||
	    ipv6_addr_is_multicast(&msg->target))
		goto out;

	n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev);
	if (!n) {
		if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
			union vxlan_addr ipa = {
				.sin6.sin6_addr = msg->target,
				.sin6.sin6_family = AF_INET6,
			};

			vxlan_ip_miss(dev, &ipa);
		}
		goto out;
	}

	if (!(n->nud_state & NUD_CONNECTED)) {
		neigh_release(n);
		goto out;
	}

	f = vxlan_find_mac(vxlan, n->ha, vni);
	if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
		/* bridge-local neighbor */
		neigh_release(n);
		goto out;
	}

	/* The router flag is taken from the fdb entry, when there is one. */
	reply = vxlan_na_create(skb, n,
				!!(f ? f->flags & NTF_ROUTER : 0));

	neigh_release(n);

	if (reply == NULL)
		goto out;

	if (netif_rx_ni(reply) == NET_RX_DROP)
		dev->stats.rx_dropped++;

out:
	consume_skb(skb);
	return NETDEV_TX_OK;
}
#endif

D
David Stevens 已提交
1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947
static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct neighbour *n;

	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
		return false;

	n = NULL;
	switch (ntohs(eth_hdr(skb)->h_proto)) {
	case ETH_P_IP:
1948 1949 1950
	{
		struct iphdr *pip;

D
David Stevens 已提交
1951 1952 1953 1954
		if (!pskb_may_pull(skb, sizeof(struct iphdr)))
			return false;
		pip = ip_hdr(skb);
		n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
1955
		if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
C
Cong Wang 已提交
1956 1957
			union vxlan_addr ipa = {
				.sin.sin_addr.s_addr = pip->daddr,
1958
				.sin.sin_family = AF_INET,
C
Cong Wang 已提交
1959 1960 1961 1962 1963 1964
			};

			vxlan_ip_miss(dev, &ipa);
			return false;
		}

D
David Stevens 已提交
1965
		break;
1966 1967 1968 1969 1970 1971 1972 1973 1974 1975
	}
#if IS_ENABLED(CONFIG_IPV6)
	case ETH_P_IPV6:
	{
		struct ipv6hdr *pip6;

		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
			return false;
		pip6 = ipv6_hdr(skb);
		n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev);
1976
		if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
1977 1978
			union vxlan_addr ipa = {
				.sin6.sin6_addr = pip6->daddr,
1979
				.sin6.sin6_family = AF_INET6,
1980 1981 1982 1983 1984 1985 1986 1987 1988
			};

			vxlan_ip_miss(dev, &ipa);
			return false;
		}

		break;
	}
#endif
D
David Stevens 已提交
1989 1990 1991 1992 1993 1994 1995
	default:
		return false;
	}

	if (n) {
		bool diff;

1996
		diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha);
D
David Stevens 已提交
1997 1998 1999 2000 2001 2002 2003
		if (diff) {
			memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
				dev->addr_len);
			memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
		}
		neigh_release(n);
		return diff;
C
Cong Wang 已提交
2004 2005
	}

D
David Stevens 已提交
2006 2007 2008
	return false;
}

2009
/* Encode the Group Based Policy metadata of @md into the VXLAN header
 * @vxh.  The header is left untouched when md->gbp is zero.
 */
static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)vxh;
	u32 policy = md->gbp;

	if (!policy)
		return;

	vxh->vx_flags |= VXLAN_HF_GBP;

	if (policy & VXLAN_GBP_DONT_LEARN)
		gbp->dont_learn = 1;

	if (policy & VXLAN_GBP_POLICY_APPLIED)
		gbp->policy_applied = 1;

	gbp->policy_id = htons(policy & VXLAN_GBP_ID_MASK);
}

J
Jiri Benc 已提交
2029 2030 2031 2032 2033 2034
/* Set the GPE extension fields of a VXLAN header.
 *
 * @vxh:      VXLAN header to update; accessed through its GPE layout
 * @vxflags:  device flags (not consulted by this helper)
 * @protocol: ethertype of the payload, translated with tun_p_from_eth_p()
 *
 * Returns 0 on success, or -EPFNOSUPPORT when the ethertype translates to a
 * zero next-protocol value.
 */
static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags,
			       __be16 protocol)
{
	struct vxlanhdr_gpe *hdr = (struct vxlanhdr_gpe *)vxh;

	hdr->np_applied = 1;
	hdr->next_protocol = tun_p_from_eth_p(protocol);

	return hdr->next_protocol ? 0 : -EPFNOSUPPORT;
}

2041 2042 2043
static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
			   int iphdr_len, __be32 vni,
			   struct vxlan_metadata *md, u32 vxflags,
2044
			   bool udp_sum)
C
Cong Wang 已提交
2045 2046 2047 2048
{
	struct vxlanhdr *vxh;
	int min_headroom;
	int err;
T
Tom Herbert 已提交
2049
	int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
J
Jiri Benc 已提交
2050
	__be16 inner_protocol = htons(ETH_P_TEB);
T
Tom Herbert 已提交
2051

2052
	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
T
Tom Herbert 已提交
2053 2054 2055 2056 2057 2058
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		int csum_start = skb_checksum_start_offset(skb);

		if (csum_start <= VXLAN_MAX_REMCSUM_START &&
		    !(csum_start & VXLAN_RCO_SHIFT_MASK) &&
		    (skb->csum_offset == offsetof(struct udphdr, check) ||
2059
		     skb->csum_offset == offsetof(struct tcphdr, check)))
T
Tom Herbert 已提交
2060 2061
			type |= SKB_GSO_TUNNEL_REMCSUM;
	}
C
Cong Wang 已提交
2062 2063

	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
2064
			+ VXLAN_HLEN + iphdr_len;
2065 2066 2067

	/* Need space for new headers (invalidates iph ptr) */
	err = skb_cow_head(skb, min_headroom);
J
Jiri Benc 已提交
2068
	if (unlikely(err))
P
pravin shelar 已提交
2069
		return err;
2070

2071 2072
	err = iptunnel_handle_offloads(skb, type);
	if (err)
P
pravin shelar 已提交
2073
		return err;
2074

2075
	vxh = __skb_push(skb, sizeof(*vxh));
2076 2077
	vxh->vx_flags = VXLAN_HF_VNI;
	vxh->vx_vni = vxlan_vni_field(vni);
2078

T
Tom Herbert 已提交
2079
	if (type & SKB_GSO_TUNNEL_REMCSUM) {
2080
		unsigned int start;
T
Tom Herbert 已提交
2081

2082 2083 2084
		start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr);
		vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset);
		vxh->vx_flags |= VXLAN_HF_RCO;
T
Tom Herbert 已提交
2085 2086 2087 2088 2089 2090 2091

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}
	}

2092 2093
	if (vxflags & VXLAN_F_GBP)
		vxlan_build_gbp_hdr(vxh, vxflags, md);
J
Jiri Benc 已提交
2094 2095 2096
	if (vxflags & VXLAN_F_GPE) {
		err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
		if (err < 0)
P
pravin shelar 已提交
2097
			return err;
J
Jiri Benc 已提交
2098 2099
		inner_protocol = skb->protocol;
	}
T
Thomas Graf 已提交
2100

J
Jiri Benc 已提交
2101
	skb_set_inner_protocol(skb, inner_protocol);
2102
	return 0;
2103 2104
}

2105 2106
/* Look up the IPv4 route for the outer packet.
 *
 * @saddr is both input (preferred source) and output (source address chosen
 * by the lookup or stored in the cache). When the dst cache is usable the
 * cached route is returned if present, and a fresh lookup result is stored
 * back into it. The cache is bypassed when a TOS is set without tunnel info.
 *
 * Returns a referenced route on success, or ERR_PTR():
 *   -EIO         no IPv4 socket
 *   -ENETUNREACH route lookup failed
 *   -ELOOP       the route points back at @dev
 */
static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct net_device *dev,
				      struct vxlan_sock *sock4,
				      struct sk_buff *skb, int oif, u8 tos,
				      __be32 daddr, __be32 *saddr, __be16 dport, __be16 sport,
				      struct dst_cache *dst_cache,
				      const struct ip_tunnel_info *info)
{
	bool cacheable = ip_tunnel_dst_cache_usable(skb, info);
	struct rtable *route = NULL;
	struct flowi4 key;

	if (!sock4)
		return ERR_PTR(-EIO);

	if (!info && tos)
		cacheable = false;

	if (cacheable) {
		route = dst_cache_get_ip4(dst_cache, saddr);
		if (route)
			return route;
	}

	memset(&key, 0, sizeof(key));
	key.flowi4_oif = oif;
	key.flowi4_tos = RT_TOS(tos);
	key.flowi4_mark = skb->mark;
	key.flowi4_proto = IPPROTO_UDP;
	key.daddr = daddr;
	key.saddr = *saddr;
	key.fl4_dport = dport;
	key.fl4_sport = sport;

	route = ip_route_output_key(vxlan->net, &key);
	if (unlikely(IS_ERR(route))) {
		netdev_dbg(dev, "no route to %pI4\n", &daddr);
		return ERR_PTR(-ENETUNREACH);
	}

	if (route->dst.dev == dev) {
		netdev_dbg(dev, "circular route to %pI4\n", &daddr);
		ip_rt_put(route);
		return ERR_PTR(-ELOOP);
	}

	*saddr = key.saddr;
	if (cacheable)
		dst_cache_set_ip4(dst_cache, &route->dst, key.saddr);

	return route;
}

2155 2156
#if IS_ENABLED(CONFIG_IPV6)
/* Look up the IPv6 route for the outer packet.
 *
 * @saddr is both input (preferred source) and output (source address chosen
 * by the lookup or stored in the cache). The dst cache is consulted and
 * refreshed when usable; it is bypassed when a TOS is set without tunnel
 * info.
 *
 * Returns a referenced dst on success, or ERR_PTR():
 *   -EIO         no IPv6 socket
 *   -ENETUNREACH route lookup failed
 *   -ELOOP       the route points back at @dev
 */
static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
					  struct net_device *dev,
					  struct vxlan_sock *sock6,
					  struct sk_buff *skb, int oif, u8 tos,
					  __be32 label,
					  const struct in6_addr *daddr,
					  struct in6_addr *saddr,
					  __be16 dport, __be16 sport,
					  struct dst_cache *dst_cache,
					  const struct ip_tunnel_info *info)
{
	bool cacheable = ip_tunnel_dst_cache_usable(skb, info);
	struct dst_entry *route;
	struct flowi6 key;
	int ret;

	if (!sock6)
		return ERR_PTR(-EIO);

	if (!info && tos)
		cacheable = false;

	if (cacheable) {
		route = dst_cache_get_ip6(dst_cache, saddr);
		if (route)
			return route;
	}

	memset(&key, 0, sizeof(key));
	key.flowi6_oif = oif;
	key.daddr = *daddr;
	key.saddr = *saddr;
	key.flowlabel = ip6_make_flowinfo(RT_TOS(tos), label);
	key.flowi6_mark = skb->mark;
	key.flowi6_proto = IPPROTO_UDP;
	key.fl6_dport = dport;
	key.fl6_sport = sport;

	ret = ipv6_stub->ipv6_dst_lookup(vxlan->net, sock6->sock->sk,
					 &route, &key);
	if (unlikely(ret < 0)) {
		netdev_dbg(dev, "no route to %pI6\n", daddr);
		return ERR_PTR(-ENETUNREACH);
	}

	if (unlikely(route->dev == dev)) {
		netdev_dbg(dev, "circular route to %pI6\n", daddr);
		dst_release(route);
		return ERR_PTR(-ELOOP);
	}

	*saddr = key.saddr;
	if (cacheable)
		dst_cache_set_ip6(dst_cache, route, saddr);

	return route;
}
#endif

2214 2215
/* Bypass encapsulation if the destination is local.
 *
 * Hands @skb straight to @dst_vxlan's device with netif_rx() instead of
 * encapsulating it. The skb is always consumed.
 *
 * @skb:       packet being transmitted on @src_vxlan
 * @src_vxlan: sending device; its tx counters are updated
 * @dst_vxlan: receiving device; its rx counters are updated
 * @vni:       VNI passed to vxlan_snoop() when learning is enabled
 */
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
			       struct vxlan_dev *dst_vxlan, __be32 vni)
{
	struct pcpu_sw_netstats *tx_stats, *rx_stats;
	union vxlan_addr loopback;
	union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
	struct net_device *rx_dev = dst_vxlan->dev;
	int len = skb->len;	/* sampled before the pull below */

	tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
	rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats);
	skb->pkt_type = PACKET_HOST;
	skb->encapsulation = 0;
	skb->dev = rx_dev;
	__skb_pull(skb, skb_network_offset(skb));

	/* Learned entries for bypassed traffic point at the loopback address
	 * of the destination device's address family.
	 */
	if (remote_ip->sa.sa_family == AF_INET) {
		loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		loopback.sa.sa_family =  AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		loopback.sin6.sin6_addr = in6addr_loopback;
		loopback.sa.sa_family =  AF_INET6;
#endif
	}

	if (dst_vxlan->cfg.flags & VXLAN_F_LEARN)
		vxlan_snoop(skb->dev, &loopback, eth_hdr(skb)->h_source, 0,
			    vni);

	u64_stats_update_begin(&tx_stats->syncp);
	tx_stats->tx_packets++;
	tx_stats->tx_bytes += len;
	u64_stats_update_end(&tx_stats->syncp);

	if (netif_rx(skb) == NET_RX_SUCCESS) {
		u64_stats_update_begin(&rx_stats->syncp);
		rx_stats->rx_packets++;
		rx_stats->rx_bytes += len;
		u64_stats_update_end(&rx_stats->syncp);
	} else {
		/* A receive drop belongs to the receiving device, matching
		 * the rx_packets/rx_bytes accounting above, not to the
		 * device the packet was sent from.
		 */
		rx_dev->stats.rx_dropped++;
	}
}

2260
static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
2261 2262 2263 2264
				 struct vxlan_dev *vxlan,
				 union vxlan_addr *daddr,
				 __be16 dst_port, int dst_ifindex, __be32 vni,
				 struct dst_entry *dst,
2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279
				 u32 rt_flags)
{
#if IS_ENABLED(CONFIG_IPV6)
	/* IPv6 rt-flags are checked against RTF_LOCAL, but the value of
	 * RTF_LOCAL is equal to RTCF_LOCAL. So to keep code simple
	 * we can use RTCF_LOCAL which works for ipv4 and ipv6 route entry.
	 */
	BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL);
#endif
	/* Bypass encapsulation if the destination is local */
	if (rt_flags & RTCF_LOCAL &&
	    !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
		struct vxlan_dev *dst_vxlan;

		dst_release(dst);
2280
		dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni,
2281
					   daddr->sa.sa_family, dst_port,
2282
					   vxlan->cfg.flags);
2283 2284 2285 2286 2287 2288
		if (!dst_vxlan) {
			dev->stats.tx_errors++;
			kfree_skb(skb);

			return -ENOENT;
		}
2289
		vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni);
2290 2291 2292 2293 2294 2295
		return 1;
	}

	return 0;
}

2296
static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
2297 2298
			   __be32 default_vni, struct vxlan_rdst *rdst,
			   bool did_rsc)
S
stephen hemminger 已提交
2299
{
2300
	struct dst_cache *dst_cache;
2301
	struct ip_tunnel_info *info;
S
stephen hemminger 已提交
2302
	struct vxlan_dev *vxlan = netdev_priv(dev);
P
pravin shelar 已提交
2303
	const struct iphdr *old_iph = ip_hdr(skb);
C
Cong Wang 已提交
2304
	union vxlan_addr *dst;
2305
	union vxlan_addr remote_ip, local_ip;
T
Thomas Graf 已提交
2306 2307
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
C
Cong Wang 已提交
2308
	__be16 src_port = 0, dst_port;
2309
	struct dst_entry *ndst = NULL;
2310
	__be32 vni, label;
S
stephen hemminger 已提交
2311
	__u8 tos, ttl;
2312
	int ifindex;
2313
	int err;
2314
	u32 flags = vxlan->cfg.flags;
2315
	bool udp_sum = false;
2316
	bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));
S
stephen hemminger 已提交
2317

2318
	info = skb_tunnel_info(skb);
2319

T
Thomas Graf 已提交
2320
	if (rdst) {
P
pravin shelar 已提交
2321 2322 2323 2324
		dst = &rdst->remote_ip;
		if (vxlan_addr_any(dst)) {
			if (did_rsc) {
				/* short-circuited back to local bridge */
2325
				vxlan_encap_bypass(skb, vxlan, vxlan, default_vni);
P
pravin shelar 已提交
2326 2327 2328 2329 2330
				return;
			}
			goto drop;
		}

2331
		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
2332
		vni = (rdst->remote_vni) ? : default_vni;
2333
		ifindex = rdst->remote_ifindex;
2334
		local_ip = vxlan->cfg.saddr;
2335
		dst_cache = &rdst->dst_cache;
P
pravin shelar 已提交
2336
		md->gbp = skb->mark;
H
Hangbin Liu 已提交
2337 2338 2339 2340 2341 2342 2343
		if (flags & VXLAN_F_TTL_INHERIT) {
			ttl = ip_tunnel_get_ttl(old_iph, skb);
		} else {
			ttl = vxlan->cfg.ttl;
			if (!ttl && vxlan_addr_multicast(dst))
				ttl = 1;
		}
P
pravin shelar 已提交
2344 2345 2346 2347 2348 2349 2350 2351 2352 2353

		tos = vxlan->cfg.tos;
		if (tos == 1)
			tos = ip_tunnel_get_dsfield(old_iph, skb);

		if (dst->sa.sa_family == AF_INET)
			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
		else
			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
		label = vxlan->cfg.label;
T
Thomas Graf 已提交
2354 2355 2356 2357 2358 2359
	} else {
		if (!info) {
			WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
				  dev->name);
			goto drop;
		}
2360
		remote_ip.sa.sa_family = ip_tunnel_info_af(info);
2361
		if (remote_ip.sa.sa_family == AF_INET) {
2362
			remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
2363 2364
			local_ip.sin.sin_addr.s_addr = info->key.u.ipv4.src;
		} else {
2365
			remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;
2366 2367
			local_ip.sin6.sin6_addr = info->key.u.ipv6.src;
		}
T
Thomas Graf 已提交
2368
		dst = &remote_ip;
P
pravin shelar 已提交
2369 2370
		dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
		vni = tunnel_id_to_key32(info->key.tun_id);
2371
		ifindex = 0;
2372
		dst_cache = &info->dst_cache;
2373 2374
		if (info->options_len &&
		    info->key.tun_flags & TUNNEL_VXLAN_OPT)
P
pravin shelar 已提交
2375
			md = ip_tunnel_info_opts(info);
2376 2377
		ttl = info->key.ttl;
		tos = info->key.tos;
2378
		label = info->key.label;
2379
		udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
2380
	}
P
pravin shelar 已提交
2381 2382
	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
				     vxlan->cfg.port_max, true);
2383

J
Jakub Kicinski 已提交
2384
	rcu_read_lock();
C
Cong Wang 已提交
2385
	if (dst->sa.sa_family == AF_INET) {
2386
		struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
P
pravin shelar 已提交
2387
		struct rtable *rt;
P
pravin shelar 已提交
2388
		__be16 df = 0;
2389

2390 2391 2392
		if (!ifindex)
			ifindex = sock4->sock->sk->sk_bound_dev_if;

2393
		rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos,
2394
				     dst->sin.sin_addr.s_addr,
2395
				     &local_ip.sin.sin_addr.s_addr,
2396
				     dst_port, src_port,
2397
				     dst_cache, info);
2398 2399
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
P
pravin shelar 已提交
2400
			goto tx_error;
2401
		}
C
Cong Wang 已提交
2402

2403
		if (!info) {
2404
			/* Bypass encapsulation if the destination is local */
2405
			err = encap_bypass_if_local(skb, dev, vxlan, dst,
2406 2407
						    dst_port, ifindex, vni,
						    &rt->dst, rt->rt_flags);
2408
			if (err)
J
Jakub Kicinski 已提交
2409
				goto out_unlock;
2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420

			if (vxlan->cfg.df == VXLAN_DF_SET) {
				df = htons(IP_DF);
			} else if (vxlan->cfg.df == VXLAN_DF_INHERIT) {
				struct ethhdr *eth = eth_hdr(skb);

				if (ntohs(eth->h_proto) == ETH_P_IPV6 ||
				    (ntohs(eth->h_proto) == ETH_P_IP &&
				     old_iph->frag_off & htons(IP_DF)))
					df = htons(IP_DF);
			}
2421
		} else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) {
2422
			df = htons(IP_DF);
2423
		}
2424

P
pravin shelar 已提交
2425
		ndst = &rt->dst;
2426
		skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM);
X
Xin Long 已提交
2427

C
Cong Wang 已提交
2428 2429
		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
P
pravin shelar 已提交
2430
		err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
2431
				      vni, md, flags, udp_sum);
2432
		if (err < 0)
P
pravin shelar 已提交
2433
			goto tx_error;
2434

2435
		udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, local_ip.sin.sin_addr.s_addr,
2436 2437
				    dst->sin.sin_addr.s_addr, tos, ttl, df,
				    src_port, dst_port, xnet, !udp_sum);
C
Cong Wang 已提交
2438 2439
#if IS_ENABLED(CONFIG_IPV6)
	} else {
2440
		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
C
Cong Wang 已提交
2441

2442 2443 2444
		if (!ifindex)
			ifindex = sock6->sock->sk->sk_bound_dev_if;

2445
		ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos,
2446
					label, &dst->sin6.sin6_addr,
2447
					&local_ip.sin6.sin6_addr,
2448
					dst_port, src_port,
2449
					dst_cache, info);
2450
		if (IS_ERR(ndst)) {
2451
			err = PTR_ERR(ndst);
P
pravin shelar 已提交
2452
			ndst = NULL;
2453
			goto tx_error;
C
Cong Wang 已提交
2454
		}
2455

2456 2457
		if (!info) {
			u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
2458

2459
			err = encap_bypass_if_local(skb, dev, vxlan, dst,
2460 2461
						    dst_port, ifindex, vni,
						    ndst, rt6i_flags);
2462
			if (err)
J
Jakub Kicinski 已提交
2463
				goto out_unlock;
2464
		}
2465

2466
		skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM);
X
Xin Long 已提交
2467

2468
		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
C
Cong Wang 已提交
2469
		ttl = ttl ? : ip6_dst_hoplimit(ndst);
2470 2471
		skb_scrub_packet(skb, xnet);
		err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr),
2472
				      vni, md, flags, udp_sum);
P
pravin shelar 已提交
2473 2474 2475
		if (err < 0)
			goto tx_error;

P
pravin shelar 已提交
2476
		udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
2477
				     &local_ip.sin6.sin6_addr,
2478
				     &dst->sin6.sin6_addr, tos, ttl,
2479
				     label, src_port, dst_port, !udp_sum);
C
Cong Wang 已提交
2480 2481
#endif
	}
J
Jakub Kicinski 已提交
2482 2483
out_unlock:
	rcu_read_unlock();
2484
	return;
S
stephen hemminger 已提交
2485 2486 2487

drop:
	dev->stats.tx_dropped++;
P
pravin shelar 已提交
2488 2489
	dev_kfree_skb(skb);
	return;
S
stephen hemminger 已提交
2490 2491

tx_error:
J
Jakub Kicinski 已提交
2492
	rcu_read_unlock();
2493 2494 2495 2496
	if (err == -ELOOP)
		dev->stats.collisions++;
	else if (err == -ENETUNREACH)
		dev->stats.tx_carrier_errors++;
P
pravin shelar 已提交
2497
	dst_release(ndst);
S
stephen hemminger 已提交
2498
	dev->stats.tx_errors++;
P
pravin shelar 已提交
2499
	kfree_skb(skb);
S
stephen hemminger 已提交
2500 2501
}

2502 2503 2504 2505 2506 2507 2508 2509 2510
/* Transmit local packets over Vxlan
 *
 * Outer IP header inherits ECN and DF from inner header.
 * Outer UDP destination is the VXLAN assigned port.
 *           source port is based on hash of flow
 */
static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
2511
	struct vxlan_rdst *rdst, *fdst = NULL;
2512
	const struct ip_tunnel_info *info;
2513 2514
	bool did_rsc = false;
	struct vxlan_fdb *f;
2515
	struct ethhdr *eth;
2516
	__be32 vni = 0;
2517

2518
	info = skb_tunnel_info(skb);
2519

2520 2521
	skb_reset_mac_header(skb);

2522
	if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
2523 2524 2525 2526 2527 2528 2529 2530 2531 2532
		if (info && info->mode & IP_TUNNEL_INFO_BRIDGE &&
		    info->mode & IP_TUNNEL_INFO_TX) {
			vni = tunnel_id_to_key32(info->key.tun_id);
		} else {
			if (info && info->mode & IP_TUNNEL_INFO_TX)
				vxlan_xmit_one(skb, dev, vni, NULL, false);
			else
				kfree_skb(skb);
			return NETDEV_TX_OK;
		}
2533 2534
	}

2535
	if (vxlan->cfg.flags & VXLAN_F_PROXY) {
2536
		eth = eth_hdr(skb);
C
Cong Wang 已提交
2537
		if (ntohs(eth->h_proto) == ETH_P_ARP)
2538
			return arp_reduce(dev, skb, vni);
C
Cong Wang 已提交
2539
#if IS_ENABLED(CONFIG_IPV6)
2540 2541 2542 2543 2544 2545 2546 2547
		else if (ntohs(eth->h_proto) == ETH_P_IPV6 &&
			 pskb_may_pull(skb, sizeof(struct ipv6hdr) +
					    sizeof(struct nd_msg)) &&
			 ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
			struct nd_msg *m = (struct nd_msg *)(ipv6_hdr(skb) + 1);

			if (m->icmph.icmp6_code == 0 &&
			    m->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
2548
				return neigh_reduce(dev, skb, vni);
C
Cong Wang 已提交
2549 2550 2551
		}
#endif
	}
2552

2553
	eth = eth_hdr(skb);
2554
	f = vxlan_find_mac(vxlan, eth->h_dest, vni);
2555 2556
	did_rsc = false;

2557
	if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) &&
2558 2559
	    (ntohs(eth->h_proto) == ETH_P_IP ||
	     ntohs(eth->h_proto) == ETH_P_IPV6)) {
2560 2561
		did_rsc = route_shortcircuit(dev, skb);
		if (did_rsc)
2562
			f = vxlan_find_mac(vxlan, eth->h_dest, vni);
2563 2564
	}

2565
	if (f == NULL) {
2566
		f = vxlan_find_mac(vxlan, all_zeros_mac, vni);
2567
		if (f == NULL) {
2568
			if ((vxlan->cfg.flags & VXLAN_F_L2MISS) &&
2569 2570 2571 2572
			    !is_multicast_ether_addr(eth->h_dest))
				vxlan_fdb_miss(vxlan, eth->h_dest);

			dev->stats.tx_dropped++;
2573
			kfree_skb(skb);
2574 2575 2576
			return NETDEV_TX_OK;
		}
	}
2577

2578 2579
	list_for_each_entry_rcu(rdst, &f->remotes, list) {
		struct sk_buff *skb1;
2580

2581 2582 2583 2584
		if (!fdst) {
			fdst = rdst;
			continue;
		}
2585 2586
		skb1 = skb_clone(skb, GFP_ATOMIC);
		if (skb1)
2587
			vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc);
2588 2589
	}

2590
	if (fdst)
2591
		vxlan_xmit_one(skb, dev, vni, fdst, did_rsc);
2592 2593
	else
		kfree_skb(skb);
2594
	return NETDEV_TX_OK;
2595 2596
}

S
stephen hemminger 已提交
2597
/* Walk the forwarding table and purge stale entries */
2598
static void vxlan_cleanup(struct timer_list *t)
S
stephen hemminger 已提交
2599
{
2600
	struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer);
S
stephen hemminger 已提交
2601 2602 2603 2604 2605 2606 2607 2608
	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
	unsigned int h;

	if (!netif_running(vxlan->dev))
		return;

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;
2609 2610

		spin_lock_bh(&vxlan->hash_lock);
S
stephen hemminger 已提交
2611 2612 2613 2614 2615
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
			unsigned long timeout;

2616
			if (f->state & (NUD_PERMANENT | NUD_NOARP))
S
stephen hemminger 已提交
2617 2618
				continue;

2619 2620 2621
			if (f->flags & NTF_EXT_LEARNED)
				continue;

2622
			timeout = f->used + vxlan->cfg.age_interval * HZ;
S
stephen hemminger 已提交
2623 2624 2625 2626 2627
			if (time_before_eq(timeout, jiffies)) {
				netdev_dbg(vxlan->dev,
					   "garbage collect %pM\n",
					   f->eth_addr);
				f->state = NUD_STALE;
2628
				vxlan_fdb_destroy(vxlan, f, true, true);
S
stephen hemminger 已提交
2629 2630 2631
			} else if (time_before(timeout, next_timer))
				next_timer = timeout;
		}
2632
		spin_unlock_bh(&vxlan->hash_lock);
S
stephen hemminger 已提交
2633 2634 2635 2636 2637
	}

	mod_timer(&vxlan->age_timer, next_timer);
}

2638 2639 2640 2641 2642
/* Unlink the device's per-family nodes from their socket VNI hash lists,
 * under the per-netns sock_lock.
 */
static void vxlan_vs_del_dev(struct vxlan_dev *vxlan)
{
	struct vxlan_net *netns_priv = net_generic(vxlan->net, vxlan_net_id);

	spin_lock(&netns_priv->sock_lock);

	hlist_del_init_rcu(&vxlan->hlist4.hlist);
#if IS_ENABLED(CONFIG_IPV6)
	hlist_del_init_rcu(&vxlan->hlist6.hlist);
#endif

	spin_unlock(&netns_priv->sock_lock);
}

J
Jiri Benc 已提交
2650 2651
static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan,
			     struct vxlan_dev_node *node)
2652
{
2653
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
2654
	__be32 vni = vxlan->default_dst.remote_vni;
2655

J
Jiri Benc 已提交
2656
	node->vxlan = vxlan;
2657
	spin_lock(&vn->sock_lock);
J
Jiri Benc 已提交
2658
	hlist_add_head_rcu(&node->hlist, vni_head(vs, vni));
2659
	spin_unlock(&vn->sock_lock);
2660 2661
}

S
stephen hemminger 已提交
2662 2663 2664
/* Setup stats when device is created */
static int vxlan_init(struct net_device *dev)
{
2665
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
2666
	if (!dev->tstats)
S
stephen hemminger 已提交
2667 2668 2669 2670 2671
		return -ENOMEM;

	return 0;
}

2672
/* Destroy the default (all-zeros MAC) FDB entry for @vni, if one exists.
 * The lookup and removal both happen under the FDB hash lock.
 */
static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni)
{
	struct vxlan_fdb *entry;

	spin_lock_bh(&vxlan->hash_lock);

	entry = __vxlan_find_mac(vxlan, all_zeros_mac, vni);
	if (entry != NULL)
		vxlan_fdb_destroy(vxlan, entry, true, true);

	spin_unlock_bh(&vxlan->hash_lock);
}

2683 2684 2685 2686
static void vxlan_uninit(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

2687
	vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni);
2688

2689 2690 2691
	free_percpu(dev->tstats);
}

S
stephen hemminger 已提交
2692 2693 2694 2695
/* Start ageing timer and join group when device is brought up */
static int vxlan_open(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
2696
	int ret;
2697

2698 2699 2700
	ret = vxlan_sock_add(vxlan);
	if (ret < 0)
		return ret;
S
stephen hemminger 已提交
2701

2702
	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
2703
		ret = vxlan_igmp_join(vxlan);
2704 2705
		if (ret == -EADDRINUSE)
			ret = 0;
2706
		if (ret) {
2707
			vxlan_sock_release(vxlan);
2708 2709
			return ret;
		}
S
stephen hemminger 已提交
2710 2711
	}

2712
	if (vxlan->cfg.age_interval)
S
stephen hemminger 已提交
2713 2714
		mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);

2715
	return ret;
S
stephen hemminger 已提交
2716 2717 2718
}

/* Purge the forwarding table */
2719
static void vxlan_flush(struct vxlan_dev *vxlan, bool do_all)
S
stephen hemminger 已提交
2720
{
2721
	unsigned int h;
S
stephen hemminger 已提交
2722 2723 2724 2725 2726 2727 2728

	spin_lock_bh(&vxlan->hash_lock);
	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
2729 2730
			if (!do_all && (f->state & (NUD_PERMANENT | NUD_NOARP)))
				continue;
2731 2732
			/* the all_zeros_mac entry is deleted at vxlan_uninit */
			if (!is_zero_ether_addr(f->eth_addr))
2733
				vxlan_fdb_destroy(vxlan, f, true, true);
S
stephen hemminger 已提交
2734 2735 2736 2737 2738 2739 2740 2741 2742
		}
	}
	spin_unlock_bh(&vxlan->hash_lock);
}

/* Cleanup timer and forwarding table on shutdown */
static int vxlan_stop(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
N
Nicolas Dichtel 已提交
2743
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
2744
	int ret = 0;
S
stephen hemminger 已提交
2745

2746
	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
2747
	    !vxlan_group_used(vn, vxlan))
2748
		ret = vxlan_igmp_leave(vxlan);
S
stephen hemminger 已提交
2749 2750 2751

	del_timer_sync(&vxlan->age_timer);

2752
	vxlan_flush(vxlan, false);
2753
	vxlan_sock_release(vxlan);
S
stephen hemminger 已提交
2754

2755
	return ret;
S
stephen hemminger 已提交
2756 2757 2758 2759 2760 2761 2762
}

/* Stub, nothing needs to be done.
 *
 * Installed as .ndo_set_rx_mode in vxlan_netdev_ether_ops; the body is
 * intentionally empty.
 */
static void vxlan_set_multicast_list(struct net_device *dev)
{
}

2763
static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
2764
{
2765 2766 2767 2768
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	struct net_device *lowerdev = __dev_get_by_index(vxlan->net,
							 dst->remote_ifindex);
2769
	bool use_ipv6 = !!(vxlan->cfg.flags & VXLAN_F_IPV6);
2770

2771 2772 2773 2774 2775 2776 2777
	/* This check is different than dev->max_mtu, because it looks at
	 * the lowerdev->mtu, rather than the static dev->max_mtu
	 */
	if (lowerdev) {
		int max_mtu = lowerdev->mtu -
			      (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
		if (new_mtu > max_mtu)
D
David Wragg 已提交
2778 2779 2780
			return -EINVAL;
	}

2781 2782 2783 2784
	dev->mtu = new_mtu;
	return 0;
}

2785 2786 2787 2788 2789 2790 2791 2792 2793 2794
static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	__be16 sport, dport;

	sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
				  vxlan->cfg.port_max, true);
	dport = info->key.tp_dst ? : vxlan->cfg.dst_port;

2795
	if (ip_tunnel_info_af(info) == AF_INET) {
2796
		struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
2797 2798
		struct rtable *rt;

2799
		rt = vxlan_get_route(vxlan, dev, sock4, skb, 0, info->key.tos,
2800
				     info->key.u.ipv4.dst,
2801 2802
				     &info->key.u.ipv4.src, dport, sport,
				     &info->dst_cache, info);
2803 2804 2805
		if (IS_ERR(rt))
			return PTR_ERR(rt);
		ip_rt_put(rt);
2806 2807
	} else {
#if IS_ENABLED(CONFIG_IPV6)
2808
		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
2809 2810
		struct dst_entry *ndst;

2811
		ndst = vxlan6_get_route(vxlan, dev, sock6, skb, 0, info->key.tos,
2812
					info->key.label, &info->key.u.ipv6.dst,
2813 2814
					&info->key.u.ipv6.src, dport, sport,
					&info->dst_cache, info);
2815 2816 2817 2818 2819 2820 2821
		if (IS_ERR(ndst))
			return PTR_ERR(ndst);
		dst_release(ndst);
#else /* !CONFIG_IPV6 */
		return -EPFNOSUPPORT;
#endif
	}
2822 2823
	info->key.tp_src = sport;
	info->key.tp_dst = dport;
2824
	return 0;
2825 2826
}

2827
static const struct net_device_ops vxlan_netdev_ether_ops = {
S
stephen hemminger 已提交
2828
	.ndo_init		= vxlan_init,
2829
	.ndo_uninit		= vxlan_uninit,
S
stephen hemminger 已提交
2830 2831 2832
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,
	.ndo_start_xmit		= vxlan_xmit,
2833
	.ndo_get_stats64	= ip_tunnel_get_stats64,
S
stephen hemminger 已提交
2834
	.ndo_set_rx_mode	= vxlan_set_multicast_list,
2835
	.ndo_change_mtu		= vxlan_change_mtu,
S
stephen hemminger 已提交
2836 2837 2838 2839 2840
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_fdb_add		= vxlan_fdb_add,
	.ndo_fdb_del		= vxlan_fdb_delete,
	.ndo_fdb_dump		= vxlan_fdb_dump,
R
Roopa Prabhu 已提交
2841
	.ndo_fdb_get		= vxlan_fdb_get,
2842
	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,
S
stephen hemminger 已提交
2843 2844
};

J
Jiri Benc 已提交
2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855
/* Device operations installed by vxlan_raw_setup(). Compared with
 * vxlan_netdev_ether_ops, no rx-mode, MAC address or FDB handlers are
 * provided.
 */
static const struct net_device_ops vxlan_netdev_raw_ops = {
	/* lifecycle */
	.ndo_init		= vxlan_init,
	.ndo_uninit		= vxlan_uninit,
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,

	/* datapath and statistics */
	.ndo_start_xmit		= vxlan_xmit,
	.ndo_get_stats64	= ip_tunnel_get_stats64,

	/* configuration */
	.ndo_change_mtu		= vxlan_change_mtu,

	/* tunnel metadata */
	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,
};

S
stephen hemminger 已提交
2856 2857 2858 2859 2860
/* Info for udev, that this is a virtual tunnel endpoint.
 * Attached to each device with SET_NETDEV_DEVTYPE() in vxlan_setup().
 */
static struct device_type vxlan_type = {
	.name = "vxlan",
};

2861
/* Calls the ndo_udp_tunnel_add of the caller in order to
J
Joseph Gasparakis 已提交
2862
 * supply the listening VXLAN udp ports. Callers are expected
2863
 * to implement the ndo_udp_tunnel_add.
2864
 */
2865
static void vxlan_offload_rx_ports(struct net_device *dev, bool push)
2866 2867 2868 2869
{
	struct vxlan_sock *vs;
	struct net *net = dev_net(dev);
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
J
Joseph Gasparakis 已提交
2870
	unsigned int i;
2871 2872 2873

	spin_lock(&vn->sock_lock);
	for (i = 0; i < PORT_HASH_SIZE; ++i) {
2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886
		hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) {
			unsigned short type;

			if (vs->flags & VXLAN_F_GPE)
				type = UDP_TUNNEL_TYPE_VXLAN_GPE;
			else
				type = UDP_TUNNEL_TYPE_VXLAN;

			if (push)
				udp_tunnel_push_rx_port(dev, vs->sock, type);
			else
				udp_tunnel_drop_rx_port(dev, vs->sock, type);
		}
2887 2888 2889 2890
	}
	spin_unlock(&vn->sock_lock);
}

S
stephen hemminger 已提交
2891 2892 2893 2894
/* Initialize the device structure. */
static void vxlan_setup(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
2895
	unsigned int h;
S
stephen hemminger 已提交
2896

2897 2898 2899
	eth_hw_addr_random(dev);
	ether_setup(dev);

2900
	dev->needs_free_netdev = true;
S
stephen hemminger 已提交
2901 2902 2903
	SET_NETDEV_DEVTYPE(dev, &vxlan_type);

	dev->features	|= NETIF_F_LLTX;
2904
	dev->features	|= NETIF_F_SG | NETIF_F_HW_CSUM;
2905
	dev->features   |= NETIF_F_RXCSUM;
2906
	dev->features   |= NETIF_F_GSO_SOFTWARE;
2907

2908
	dev->vlan_features = dev->features;
2909
	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
2910
	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
2911
	netif_keep_dst(dev);
2912
	dev->priv_flags |= IFF_NO_QUEUE;
S
stephen hemminger 已提交
2913

2914 2915 2916 2917
	/* MTU range: 68 - 65535 */
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = ETH_MAX_MTU;

2918
	INIT_LIST_HEAD(&vxlan->next);
S
stephen hemminger 已提交
2919 2920
	spin_lock_init(&vxlan->hash_lock);

2921
	timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE);
S
stephen hemminger 已提交
2922 2923 2924

	vxlan->dev = dev;

T
Tom Herbert 已提交
2925 2926
	gro_cells_init(&vxlan->gro_cells, dev);

S
stephen hemminger 已提交
2927 2928 2929 2930
	for (h = 0; h < FDB_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
}

2931 2932 2933 2934 2935 2936 2937
/* Finish setup of a device that uses vxlan_netdev_ether_ops: clear
 * IFF_TX_SKB_SHARING, set IFF_LIVE_ADDR_CHANGE and install the ops table.
 */
static void vxlan_ether_setup(struct net_device *dev)
{
	dev->priv_flags = (dev->priv_flags & ~IFF_TX_SKB_SHARING) |
			  IFF_LIVE_ADDR_CHANGE;
	dev->netdev_ops = &vxlan_netdev_ether_ops;
}

J
Jiri Benc 已提交
2938 2939
static void vxlan_raw_setup(struct net_device *dev)
{
2940
	dev->header_ops = NULL;
J
Jiri Benc 已提交
2941 2942 2943 2944 2945 2946 2947
	dev->type = ARPHRD_NONE;
	dev->hard_header_len = 0;
	dev->addr_len = 0;
	dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
	dev->netdev_ops = &vxlan_netdev_raw_ops;
}

S
stephen hemminger 已提交
2948 2949
static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
2950
	[IFLA_VXLAN_GROUP]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
C
Cong Wang 已提交
2951
	[IFLA_VXLAN_GROUP6]	= { .len = sizeof(struct in6_addr) },
S
stephen hemminger 已提交
2952 2953
	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
	[IFLA_VXLAN_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
C
Cong Wang 已提交
2954
	[IFLA_VXLAN_LOCAL6]	= { .len = sizeof(struct in6_addr) },
S
stephen hemminger 已提交
2955 2956
	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
2957
	[IFLA_VXLAN_LABEL]	= { .type = NLA_U32 },
S
stephen hemminger 已提交
2958 2959 2960
	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
	[IFLA_VXLAN_AGEING]	= { .type = NLA_U32 },
	[IFLA_VXLAN_LIMIT]	= { .type = NLA_U32 },
2961
	[IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
D
David Stevens 已提交
2962 2963 2964 2965
	[IFLA_VXLAN_PROXY]	= { .type = NLA_U8 },
	[IFLA_VXLAN_RSC]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L2MISS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L3MISS]	= { .type = NLA_U8 },
2966
	[IFLA_VXLAN_COLLECT_METADATA]	= { .type = NLA_U8 },
2967
	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
2968 2969 2970
	[IFLA_VXLAN_UDP_CSUM]	= { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]	= { .type = NLA_U8 },
T
Tom Herbert 已提交
2971 2972
	[IFLA_VXLAN_REMCSUM_TX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_REMCSUM_RX]	= { .type = NLA_U8 },
T
Thomas Graf 已提交
2973
	[IFLA_VXLAN_GBP]	= { .type = NLA_FLAG, },
J
Jiri Benc 已提交
2974
	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
2975
	[IFLA_VXLAN_REMCSUM_NOPARTIAL]	= { .type = NLA_FLAG },
H
Hangbin Liu 已提交
2976
	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
2977
	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
S
stephen hemminger 已提交
2978 2979
};

2980 2981
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
			  struct netlink_ext_ack *extack)
S
stephen hemminger 已提交
2982 2983 2984
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
2985 2986
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
					    "Provided link layer address is not Ethernet");
S
stephen hemminger 已提交
2987 2988 2989 2990
			return -EINVAL;
		}

		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
2991 2992
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
					    "Provided Ethernet address is not unicast");
S
stephen hemminger 已提交
2993 2994 2995 2996
			return -EADDRNOTAVAIL;
		}
	}

2997
	if (tb[IFLA_MTU]) {
2998
		u32 mtu = nla_get_u32(tb[IFLA_MTU]);
2999

3000 3001 3002
		if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
					    "MTU must be between 68 and 65535");
3003
			return -EINVAL;
3004
		}
3005 3006
	}

3007 3008 3009
	if (!data) {
		NL_SET_ERR_MSG(extack,
			       "Required attributes not provided to perform the operation");
S
stephen hemminger 已提交
3010
		return -EINVAL;
3011
	}
S
stephen hemminger 已提交
3012 3013

	if (data[IFLA_VXLAN_ID]) {
3014 3015
		u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);

3016 3017 3018
		if (id >= VXLAN_N_VID) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID],
					    "VXLAN ID must be lower than 16777216");
S
stephen hemminger 已提交
3019
			return -ERANGE;
3020
		}
S
stephen hemminger 已提交
3021 3022
	}

3023 3024 3025 3026 3027
	if (data[IFLA_VXLAN_PORT_RANGE]) {
		const struct ifla_vxlan_port_range *p
			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);

		if (ntohs(p->high) < ntohs(p->low)) {
3028 3029
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE],
					    "Invalid source port range");
3030 3031 3032 3033
			return -EINVAL;
		}
	}

3034 3035 3036 3037 3038 3039 3040 3041 3042 3043
	if (data[IFLA_VXLAN_DF]) {
		enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]);

		if (df < 0 || df > VXLAN_DF_MAX) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_DF],
					    "Invalid DF attribute");
			return -EINVAL;
		}
	}

S
stephen hemminger 已提交
3044 3045 3046
	return 0;
}

Y
Yan Burman 已提交
3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058
/* ethtool get_drvinfo handler: report the driver name and version. */
static void vxlan_get_drvinfo(struct net_device *netdev,
			      struct ethtool_drvinfo *drvinfo)
{
	strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
	strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
}

/* ethtool operations exposed by VXLAN devices */
static const struct ethtool_ops vxlan_ethtool_ops = {
	.get_link	= ethtool_op_get_link,
	.get_drvinfo	= vxlan_get_drvinfo,
};

T
Tom Herbert 已提交
3059
static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
3060
					__be16 port, u32 flags, int ifindex)
3061
{
C
Cong Wang 已提交
3062
	struct socket *sock;
T
Tom Herbert 已提交
3063 3064
	struct udp_port_cfg udp_conf;
	int err;
C
Cong Wang 已提交
3065

T
Tom Herbert 已提交
3066
	memset(&udp_conf, 0, sizeof(udp_conf));
C
Cong Wang 已提交
3067

T
Tom Herbert 已提交
3068 3069 3070
	if (ipv6) {
		udp_conf.family = AF_INET6;
		udp_conf.use_udp6_rx_checksums =
3071
		    !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
3072
		udp_conf.ipv6_v6only = 1;
T
Tom Herbert 已提交
3073 3074
	} else {
		udp_conf.family = AF_INET;
C
Cong Wang 已提交
3075 3076
	}

T
Tom Herbert 已提交
3077
	udp_conf.local_udp_port = port;
3078
	udp_conf.bind_ifindex = ifindex;
3079

T
Tom Herbert 已提交
3080 3081 3082 3083
	/* Open UDP socket */
	err = udp_sock_create(net, &udp_conf, &sock);
	if (err < 0)
		return ERR_PTR(err);
C
Cong Wang 已提交
3084

Z
Zhi Yong Wu 已提交
3085
	return sock;
C
Cong Wang 已提交
3086 3087 3088
}

/* Create new listen socket if needed */
3089
static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
3090 3091
					      __be16 port, u32 flags,
					      int ifindex)
C
Cong Wang 已提交
3092 3093 3094 3095 3096
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_sock *vs;
	struct socket *sock;
	unsigned int h;
3097
	struct udp_tunnel_sock_cfg tunnel_cfg;
C
Cong Wang 已提交
3098

3099
	vs = kzalloc(sizeof(*vs), GFP_KERNEL);
C
Cong Wang 已提交
3100 3101 3102 3103 3104 3105
	if (!vs)
		return ERR_PTR(-ENOMEM);

	for (h = 0; h < VNI_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vs->vni_list[h]);

3106
	sock = vxlan_create_sock(net, ipv6, port, flags, ifindex);
Z
Zhi Yong Wu 已提交
3107
	if (IS_ERR(sock)) {
3108
		kfree(vs);
3109
		return ERR_CAST(sock);
3110
	}
C
Cong Wang 已提交
3111 3112

	vs->sock = sock;
3113
	refcount_set(&vs->refcnt, 1);
3114
	vs->flags = (flags & VXLAN_F_RCV_FLAGS);
3115

3116 3117
	spin_lock(&vn->sock_lock);
	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
3118
	udp_tunnel_notify_add_rx_port(sock,
3119 3120
				      (vs->flags & VXLAN_F_GPE) ?
				      UDP_TUNNEL_TYPE_VXLAN_GPE :
3121
				      UDP_TUNNEL_TYPE_VXLAN);
3122
	spin_unlock(&vn->sock_lock);
3123 3124

	/* Mark socket as an encapsulation socket. */
3125
	memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
3126 3127
	tunnel_cfg.sk_user_data = vs;
	tunnel_cfg.encap_type = 1;
3128
	tunnel_cfg.encap_rcv = vxlan_rcv;
S
Stefano Brivio 已提交
3129
	tunnel_cfg.encap_err_lookup = vxlan_err_lookup;
3130
	tunnel_cfg.encap_destroy = NULL;
3131 3132
	tunnel_cfg.gro_receive = vxlan_gro_receive;
	tunnel_cfg.gro_complete = vxlan_gro_complete;
3133 3134

	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
C
Cong Wang 已提交
3135

3136 3137 3138
	return vs;
}

3139
/* Attach @vxlan to a listen socket of one address family.
 *
 * Unless sharing is disabled in the device configuration, an existing
 * matching socket is reused (taking a reference); otherwise a new one is
 * created. Returns 0 on success, -EBUSY when a matching socket was found
 * but its reference count had already dropped to zero, or the error from
 * vxlan_socket_create().
 */
static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
{
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	struct vxlan_dev_node *node;
	struct vxlan_sock *vs = NULL;
	int l3mdev_index = 0;

	if (vxlan->cfg.remote_ifindex)
		l3mdev_index = l3mdev_master_upper_ifindex_by_index(
			vxlan->net, vxlan->cfg.remote_ifindex);

	if (!vxlan->cfg.no_share) {
		bool dying;

		spin_lock(&vn->sock_lock);
		vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
				     vxlan->cfg.dst_port, vxlan->cfg.flags,
				     l3mdev_index);
		/* a socket whose refcount already hit zero cannot be reused */
		dying = vs && !refcount_inc_not_zero(&vs->refcnt);
		spin_unlock(&vn->sock_lock);

		if (dying)
			return -EBUSY;
	}

	if (!vs)
		vs = vxlan_socket_create(vxlan->net, ipv6,
					 vxlan->cfg.dst_port, vxlan->cfg.flags,
					 l3mdev_index);
	if (IS_ERR(vs))
		return PTR_ERR(vs);

#if IS_ENABLED(CONFIG_IPV6)
	if (ipv6) {
		rcu_assign_pointer(vxlan->vn6_sock, vs);
		node = &vxlan->hlist6;
	} else
#endif
	{
		rcu_assign_pointer(vxlan->vn4_sock, vs);
		node = &vxlan->hlist4;
	}

	vxlan_vs_add_dev(vs, vxlan, node);
	return 0;
}

3181 3182
static int vxlan_sock_add(struct vxlan_dev *vxlan)
{
3183 3184
	bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
	bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata;
3185
	bool ipv4 = !ipv6 || metadata;
3186 3187
	int ret = 0;

3188
	RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
3189
#if IS_ENABLED(CONFIG_IPV6)
3190
	RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
3191
	if (ipv6) {
3192
		ret = __vxlan_sock_add(vxlan, true);
3193 3194 3195
		if (ret < 0 && ret != -EAFNOSUPPORT)
			ipv4 = false;
	}
3196
#endif
3197
	if (ipv4)
3198 3199 3200 3201 3202 3203
		ret = __vxlan_sock_add(vxlan, false);
	if (ret < 0)
		vxlan_sock_release(vxlan);
	return ret;
}

3204 3205
static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf,
				 struct net_device **lower,
3206 3207
				 struct vxlan_dev *old,
				 struct netlink_ext_ack *extack)
S
stephen hemminger 已提交
3208
{
3209
	struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
3210
	struct vxlan_dev *tmp;
C
Cong Wang 已提交
3211
	bool use_ipv6 = false;
S
stephen hemminger 已提交
3212

3213 3214 3215 3216 3217 3218 3219 3220
	if (conf->flags & VXLAN_F_GPE) {
		/* For now, allow GPE only together with
		 * COLLECT_METADATA. This can be relaxed later; in such
		 * case, the other side of the PtP link will have to be
		 * provided.
		 */
		if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
		    !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
3221 3222
			NL_SET_ERR_MSG(extack,
				       "VXLAN GPE does not support this combination of attributes");
3223
			return -EINVAL;
3224
		}
J
Jiri Benc 已提交
3225
	}
3226

3227 3228
	if (!conf->remote_ip.sa.sa_family && !conf->saddr.sa.sa_family) {
		/* Unless IPv6 is explicitly requested, assume IPv4 */
3229
		conf->remote_ip.sa.sa_family = AF_INET;
3230 3231 3232 3233 3234 3235 3236
		conf->saddr.sa.sa_family = AF_INET;
	} else if (!conf->remote_ip.sa.sa_family) {
		conf->remote_ip.sa.sa_family = conf->saddr.sa.sa_family;
	} else if (!conf->saddr.sa.sa_family) {
		conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family;
	}

3237 3238 3239
	if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) {
		NL_SET_ERR_MSG(extack,
			       "Local and remote address must be from the same family");
3240
		return -EINVAL;
3241
	}
C
Cong Wang 已提交
3242

3243 3244
	if (vxlan_addr_multicast(&conf->saddr)) {
		NL_SET_ERR_MSG(extack, "Local address cannot be multicast");
3245
		return -EINVAL;
3246
	}
3247

3248
	if (conf->saddr.sa.sa_family == AF_INET6) {
3249 3250 3251
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG(extack,
				       "IPv6 support not enabled in the kernel");
3252
			return -EPFNOSUPPORT;
3253
		}
C
Cong Wang 已提交
3254
		use_ipv6 = true;
3255
		conf->flags |= VXLAN_F_IPV6;
3256 3257 3258 3259 3260 3261 3262 3263 3264

		if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) {
			int local_type =
				ipv6_addr_type(&conf->saddr.sin6.sin6_addr);
			int remote_type =
				ipv6_addr_type(&conf->remote_ip.sin6.sin6_addr);

			if (local_type & IPV6_ADDR_LINKLOCAL) {
				if (!(remote_type & IPV6_ADDR_LINKLOCAL) &&
3265 3266 3267
				    (remote_type != IPV6_ADDR_ANY)) {
					NL_SET_ERR_MSG(extack,
						       "Invalid combination of local and remote address scopes");
3268
					return -EINVAL;
3269
				}
3270 3271 3272 3273

				conf->flags |= VXLAN_F_IPV6_LINKLOCAL;
			} else {
				if (remote_type ==
3274 3275 3276
				    (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL)) {
					NL_SET_ERR_MSG(extack,
						       "Invalid combination of local and remote address scopes");
3277
					return -EINVAL;
3278
				}
3279 3280 3281 3282

				conf->flags &= ~VXLAN_F_IPV6_LINKLOCAL;
			}
		}
3283
	}
S
stephen hemminger 已提交
3284

3285 3286 3287
	if (conf->label && !use_ipv6) {
		NL_SET_ERR_MSG(extack,
			       "Label attribute only applies to IPv6 VXLAN devices");
3288
		return -EINVAL;
3289
	}
3290

3291 3292
	if (conf->remote_ifindex) {
		struct net_device *lowerdev;
3293

3294
		lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
3295 3296 3297
		if (!lowerdev) {
			NL_SET_ERR_MSG(extack,
				       "Invalid local interface, device not found");
3298
			return -ENODEV;
3299
		}
S
stephen hemminger 已提交
3300

C
Cong Wang 已提交
3301 3302 3303
#if IS_ENABLED(CONFIG_IPV6)
		if (use_ipv6) {
			struct inet6_dev *idev = __in6_dev_get(lowerdev);
3304 3305 3306
			if (idev && idev->cnf.disable_ipv6) {
				NL_SET_ERR_MSG(extack,
					       "IPv6 support disabled by administrator");
C
Cong Wang 已提交
3307
				return -EPERM;
3308
			}
C
Cong Wang 已提交
3309 3310 3311
		}
#endif

3312 3313
		*lower = lowerdev;
	} else {
3314 3315 3316 3317
		if (vxlan_addr_multicast(&conf->remote_ip)) {
			NL_SET_ERR_MSG(extack,
				       "Local interface required for multicast remote destination");

3318
			return -EINVAL;
3319
		}
3320

3321
#if IS_ENABLED(CONFIG_IPV6)
3322 3323 3324
		if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) {
			NL_SET_ERR_MSG(extack,
				       "Local interface required for link-local local/remote addresses");
3325
			return -EINVAL;
3326
		}
3327 3328
#endif

3329
		*lower = NULL;
J
Jiri Benc 已提交
3330
	}
S
stephen hemminger 已提交
3331

3332 3333 3334 3335 3336
	if (!conf->dst_port) {
		if (conf->flags & VXLAN_F_GPE)
			conf->dst_port = htons(4790); /* IANA VXLAN-GPE port */
		else
			conf->dst_port = htons(vxlan_port);
3337 3338
	}

3339 3340
	if (!conf->age_interval)
		conf->age_interval = FDB_AGE_DEFAULT;
3341

3342 3343 3344
	list_for_each_entry(tmp, &vn->vxlan_list, next) {
		if (tmp == old)
			continue;
3345

3346 3347 3348 3349 3350
		if (tmp->cfg.vni != conf->vni)
			continue;
		if (tmp->cfg.dst_port != conf->dst_port)
			continue;
		if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) !=
3351
		    (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)))
3352 3353 3354 3355 3356 3357
			continue;

		if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) &&
		    tmp->cfg.remote_ifindex != conf->remote_ifindex)
			continue;

3358 3359
		NL_SET_ERR_MSG(extack,
			       "A VXLAN device with the specified VNI already exists");
3360
		return -EEXIST;
3361
	}
3362

3363 3364 3365 3366 3367
	return 0;
}

/* Apply an already validated configuration to @dev.
 *
 * @dev:        VXLAN netdev being configured
 * @conf:       validated configuration; copied into the device at the end
 * @lowerdev:   lower device, or NULL when none was configured
 * @src_net:    namespace recorded in the device on creation
 * @changelink: true when reconfiguring an existing device; the framing
 *              setup, the explicit MTU and the namespace are then left
 *              untouched
 */
static void vxlan_config_apply(struct net_device *dev,
			       struct vxlan_config *conf,
			       struct net_device *lowerdev,
			       struct net *src_net,
			       bool changelink)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *rdst = &vxlan->default_dst;
	const bool creating = !changelink;
	const bool v6 = !!(conf->flags & VXLAN_F_IPV6);
	unsigned short headroom = ETH_HLEN;
	int mtu_cap = ETH_MAX_MTU;

	if (creating) {
		if (conf->flags & VXLAN_F_GPE)
			vxlan_raw_setup(dev);
		else
			vxlan_ether_setup(dev);

		if (conf->mtu)
			dev->mtu = conf->mtu;

		vxlan->net = src_net;
	}

	rdst->remote_vni = conf->vni;
	memcpy(&rdst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip));

	if (lowerdev) {
		rdst->remote_ifindex = conf->remote_ifindex;

		dev->gso_max_size = lowerdev->gso_max_size;
		dev->gso_max_segs = lowerdev->gso_max_segs;

		headroom = lowerdev->hard_header_len;

		/* leave room for the encapsulation on the lower device */
		mtu_cap = lowerdev->mtu - (v6 ? VXLAN6_HEADROOM :
					   VXLAN_HEADROOM);
		if (mtu_cap < ETH_MIN_MTU)
			mtu_cap = ETH_MIN_MTU;

		if (creating && !conf->mtu)
			dev->mtu = mtu_cap;
	}

	if (dev->mtu > mtu_cap)
		dev->mtu = mtu_cap;

	/* metadata devices may encapsulate in IPv6, so size for it */
	headroom += (v6 || (conf->flags & VXLAN_F_COLLECT_METADATA)) ?
		    VXLAN6_HEADROOM : VXLAN_HEADROOM;
	dev->needed_headroom = headroom;

	memcpy(&vxlan->cfg, conf, sizeof(*conf));
}
3422

3423
/* Validate @conf and, when it is acceptable, apply it to @dev.
 *
 * Returns 0 on success or the error from vxlan_config_validate(), in
 * which case nothing is applied.
 */
static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
			       struct vxlan_config *conf, bool changelink,
			       struct netlink_ext_ack *extack)
{
	struct net_device *lowerdev;
	int err;

	err = vxlan_config_validate(src_net, conf, &lowerdev,
				    netdev_priv(dev), extack);
	if (!err)
		vxlan_config_apply(dev, conf, lowerdev, src_net, changelink);

	return err;
}

N
Nicolas Dichtel 已提交
3440
static int __vxlan_dev_create(struct net *net, struct net_device *dev,
3441 3442
			      struct vxlan_config *conf,
			      struct netlink_ext_ack *extack)
N
Nicolas Dichtel 已提交
3443 3444 3445
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan = netdev_priv(dev);
3446
	struct vxlan_fdb *f = NULL;
N
Nicolas Dichtel 已提交
3447 3448
	int err;

3449
	err = vxlan_dev_configure(net, dev, conf, false, extack);
N
Nicolas Dichtel 已提交
3450 3451 3452 3453 3454 3455 3456
	if (err)
		return err;

	dev->ethtool_ops = &vxlan_ethtool_ops;

	/* create an fdb entry for a valid default destination */
	if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) {
3457
		err = vxlan_fdb_create(vxlan, all_zeros_mac,
N
Nicolas Dichtel 已提交
3458 3459 3460 3461 3462 3463
				       &vxlan->default_dst.remote_ip,
				       NUD_REACHABLE | NUD_PERMANENT,
				       vxlan->cfg.dst_port,
				       vxlan->default_dst.remote_vni,
				       vxlan->default_dst.remote_vni,
				       vxlan->default_dst.remote_ifindex,
3464
				       NTF_SELF, &f);
N
Nicolas Dichtel 已提交
3465 3466 3467 3468 3469
		if (err)
			return err;
	}

	err = register_netdevice(dev);
3470 3471 3472 3473
	if (err)
		goto errout;

	err = rtnl_configure_link(dev, NULL);
N
Nicolas Dichtel 已提交
3474
	if (err) {
3475 3476
		unregister_netdevice(dev);
		goto errout;
N
Nicolas Dichtel 已提交
3477 3478
	}

3479 3480
	/* notify default fdb entry */
	if (f)
3481 3482
		vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH,
				 true);
3483

N
Nicolas Dichtel 已提交
3484 3485
	list_add(&vxlan->next, &vn->vxlan_list);
	return 0;
3486 3487
errout:
	if (f)
3488
		vxlan_fdb_destroy(vxlan, f, false, false);
3489
	return err;
N
Nicolas Dichtel 已提交
3490 3491
}

R
Roopa Prabhu 已提交
3492 3493 3494
/* Translate netlink attributes into a struct vxlan_config.
 *
 * @tb:         generic link attributes (only IFLA_MTU is read)
 * @data:       VXLAN specific attributes
 * @dev:        device being created or changed
 * @conf:       output; starts zeroed for a new device, or as a copy of the
 *              device's current configuration when @changelink is set
 * @changelink: true when modifying an existing device
 *
 * Returns 0 on success, -EOPNOTSUPP when @changelink is set and an
 * attribute that cannot be changed on an existing device is supplied (or
 * the VNI / address family would change), and -EPFNOSUPPORT for IPv6
 * addresses on a kernel built without IPv6.
 */
static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
			 struct net_device *dev, struct vxlan_config *conf,
			 bool changelink)
{
	struct flag_map {
		int attr;
		u32 flag;
	};
	/* create-only u8 attributes: a non-zero value sets the flag */
	static const struct flag_map u8_flags[] = {
		{ IFLA_VXLAN_PROXY,		VXLAN_F_PROXY },
		{ IFLA_VXLAN_RSC,		VXLAN_F_RSC },
		{ IFLA_VXLAN_L2MISS,		VXLAN_F_L2MISS },
		{ IFLA_VXLAN_L3MISS,		VXLAN_F_L3MISS },
		{ IFLA_VXLAN_COLLECT_METADATA,	VXLAN_F_COLLECT_METADATA },
		{ IFLA_VXLAN_UDP_ZERO_CSUM6_TX,	VXLAN_F_UDP_ZERO_CSUM6_TX },
		{ IFLA_VXLAN_UDP_ZERO_CSUM6_RX,	VXLAN_F_UDP_ZERO_CSUM6_RX },
		{ IFLA_VXLAN_REMCSUM_TX,	VXLAN_F_REMCSUM_TX },
		{ IFLA_VXLAN_REMCSUM_RX,	VXLAN_F_REMCSUM_RX },
	};
	/* create-only flag attributes: mere presence sets the flag */
	static const struct flag_map presence_flags[] = {
		{ IFLA_VXLAN_GBP,		VXLAN_F_GBP },
		{ IFLA_VXLAN_GPE,		VXLAN_F_GPE },
		{ IFLA_VXLAN_REMCSUM_NOPARTIAL,	VXLAN_F_REMCSUM_NOPARTIAL },
	};
	struct vxlan_dev *vxlan = netdev_priv(dev);
	unsigned int i;

	/* if changelink operation, start with old existing cfg */
	if (changelink)
		memcpy(conf, &vxlan->cfg, sizeof(*conf));
	else
		memset(conf, 0, sizeof(*conf));

	if (data[IFLA_VXLAN_ID]) {
		__be32 vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));

		/* the VNI of an existing device cannot change */
		if (changelink && (vni != conf->vni))
			return -EOPNOTSUPP;
		conf->vni = vni;
	}

	/* remote address; the family of an existing device cannot change */
	if (data[IFLA_VXLAN_GROUP]) {
		if (changelink && (conf->remote_ip.sa.sa_family != AF_INET))
			return -EOPNOTSUPP;

		conf->remote_ip.sin.sin_addr.s_addr =
			nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
		conf->remote_ip.sa.sa_family = AF_INET;
	} else if (data[IFLA_VXLAN_GROUP6]) {
		if (!IS_ENABLED(CONFIG_IPV6))
			return -EPFNOSUPPORT;

		if (changelink && (conf->remote_ip.sa.sa_family != AF_INET6))
			return -EOPNOTSUPP;

		conf->remote_ip.sin6.sin6_addr =
			nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
		conf->remote_ip.sa.sa_family = AF_INET6;
	}

	/* local address, same family rule */
	if (data[IFLA_VXLAN_LOCAL]) {
		if (changelink && (conf->saddr.sa.sa_family != AF_INET))
			return -EOPNOTSUPP;

		conf->saddr.sin.sin_addr.s_addr =
			nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
		conf->saddr.sa.sa_family = AF_INET;
	} else if (data[IFLA_VXLAN_LOCAL6]) {
		if (!IS_ENABLED(CONFIG_IPV6))
			return -EPFNOSUPPORT;

		if (changelink && (conf->saddr.sa.sa_family != AF_INET6))
			return -EOPNOTSUPP;

		/* TODO: respect scope id */
		conf->saddr.sin6.sin6_addr =
			nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
		conf->saddr.sa.sa_family = AF_INET6;
	}

	if (data[IFLA_VXLAN_LINK])
		conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);

	if (data[IFLA_VXLAN_TOS])
		conf->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);

	if (data[IFLA_VXLAN_TTL])
		conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);

	if (data[IFLA_VXLAN_TTL_INHERIT]) {
		if (changelink)
			return -EOPNOTSUPP;
		conf->flags |= VXLAN_F_TTL_INHERIT;
	}

	if (data[IFLA_VXLAN_LABEL])
		conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
			     IPV6_FLOWLABEL_MASK;

	if (data[IFLA_VXLAN_LEARNING]) {
		if (nla_get_u8(data[IFLA_VXLAN_LEARNING]))
			conf->flags |= VXLAN_F_LEARN;
		else
			conf->flags &= ~VXLAN_F_LEARN;
	} else if (!changelink) {
		/* default to learn on a new device */
		conf->flags |= VXLAN_F_LEARN;
	}

	if (data[IFLA_VXLAN_AGEING])
		conf->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);

	/* Everything from here up to IFLA_MTU is create-only. On changelink
	 * each of these attributes bails out with -EOPNOTSUPP before
	 * touching @conf, and on create none of them can fail, so the order
	 * in which they are visited does not affect the outcome.
	 */
	for (i = 0; i < ARRAY_SIZE(u8_flags); i++) {
		const struct nlattr *attr = data[u8_flags[i].attr];

		if (!attr)
			continue;
		if (changelink)
			return -EOPNOTSUPP;
		if (nla_get_u8(attr))
			conf->flags |= u8_flags[i].flag;
	}

	if (data[IFLA_VXLAN_LIMIT]) {
		if (changelink)
			return -EOPNOTSUPP;
		conf->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
	}

	if (data[IFLA_VXLAN_PORT_RANGE]) {
		const struct ifla_vxlan_port_range *range;

		if (changelink)
			return -EOPNOTSUPP;

		range = nla_data(data[IFLA_VXLAN_PORT_RANGE]);
		conf->port_min = ntohs(range->low);
		conf->port_max = ntohs(range->high);
	}

	if (data[IFLA_VXLAN_PORT]) {
		if (changelink)
			return -EOPNOTSUPP;
		conf->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
	}

	/* inverted sense: a zero UDP_CSUM value requests zero checksums */
	if (data[IFLA_VXLAN_UDP_CSUM]) {
		if (changelink)
			return -EOPNOTSUPP;
		if (!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
			conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
	}

	for (i = 0; i < ARRAY_SIZE(presence_flags); i++) {
		if (!data[presence_flags[i].attr])
			continue;
		if (changelink)
			return -EOPNOTSUPP;
		conf->flags |= presence_flags[i].flag;
	}

	if (tb[IFLA_MTU]) {
		if (changelink)
			return -EOPNOTSUPP;
		conf->mtu = nla_get_u32(tb[IFLA_MTU]);
	}

	if (data[IFLA_VXLAN_DF])
		conf->df = nla_get_u8(data[IFLA_VXLAN_DF]);

	return 0;
}

static int vxlan_newlink(struct net *src_net, struct net_device *dev,
3703 3704
			 struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
R
Roopa Prabhu 已提交
3705 3706 3707 3708 3709 3710 3711 3712
{
	struct vxlan_config conf;
	int err;

	err = vxlan_nl2conf(tb, data, dev, &conf, false);
	if (err)
		return err;

3713
	return __vxlan_dev_create(src_net, dev, &conf, extack);
R
Roopa Prabhu 已提交
3714
}
T
Tom Herbert 已提交
3715

R
Roopa Prabhu 已提交
3716
static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
3717 3718
			    struct nlattr *data[],
			    struct netlink_ext_ack *extack)
R
Roopa Prabhu 已提交
3719 3720 3721
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
I
Ido Schimmel 已提交
3722
	unsigned long old_age_interval;
R
Roopa Prabhu 已提交
3723 3724
	struct vxlan_rdst old_dst;
	struct vxlan_config conf;
3725
	struct vxlan_fdb *f = NULL;
R
Roopa Prabhu 已提交
3726 3727 3728 3729 3730 3731
	int err;

	err = vxlan_nl2conf(tb, data,
			    dev, &conf, true);
	if (err)
		return err;
T
Thomas Graf 已提交
3732

I
Ido Schimmel 已提交
3733
	old_age_interval = vxlan->cfg.age_interval;
R
Roopa Prabhu 已提交
3734
	memcpy(&old_dst, dst, sizeof(struct vxlan_rdst));
J
Jiri Benc 已提交
3735

3736
	err = vxlan_dev_configure(vxlan->net, dev, &conf, true, extack);
R
Roopa Prabhu 已提交
3737 3738
	if (err)
		return err;
3739

I
Ido Schimmel 已提交
3740 3741 3742
	if (old_age_interval != vxlan->cfg.age_interval)
		mod_timer(&vxlan->age_timer, jiffies);

R
Roopa Prabhu 已提交
3743 3744 3745 3746 3747 3748 3749 3750 3751
	/* handle default dst entry */
	if (!vxlan_addr_equal(&dst->remote_ip, &old_dst.remote_ip)) {
		spin_lock_bh(&vxlan->hash_lock);
		if (!vxlan_addr_any(&old_dst.remote_ip))
			__vxlan_fdb_delete(vxlan, all_zeros_mac,
					   old_dst.remote_ip,
					   vxlan->cfg.dst_port,
					   old_dst.remote_vni,
					   old_dst.remote_vni,
3752 3753
					   old_dst.remote_ifindex,
					   true);
R
Roopa Prabhu 已提交
3754 3755

		if (!vxlan_addr_any(&dst->remote_ip)) {
3756
			err = vxlan_fdb_create(vxlan, all_zeros_mac,
R
Roopa Prabhu 已提交
3757 3758 3759 3760 3761 3762
					       &dst->remote_ip,
					       NUD_REACHABLE | NUD_PERMANENT,
					       vxlan->cfg.dst_port,
					       dst->remote_vni,
					       dst->remote_vni,
					       dst->remote_ifindex,
3763
					       NTF_SELF, &f);
R
Roopa Prabhu 已提交
3764 3765 3766 3767
			if (err) {
				spin_unlock_bh(&vxlan->hash_lock);
				return err;
			}
3768 3769
			vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f),
					 RTM_NEWNEIGH, true);
R
Roopa Prabhu 已提交
3770 3771 3772
		}
		spin_unlock_bh(&vxlan->hash_lock);
	}
3773

R
Roopa Prabhu 已提交
3774
	return 0;
S
stephen hemminger 已提交
3775 3776 3777 3778 3779 3780
}

static void vxlan_dellink(struct net_device *dev, struct list_head *head)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

3781 3782
	vxlan_flush(vxlan, true);

T
Tom Herbert 已提交
3783
	gro_cells_destroy(&vxlan->gro_cells);
3784
	list_del(&vxlan->next);
S
stephen hemminger 已提交
3785 3786 3787 3788 3789 3790 3791
	unregister_netdevice_queue(dev, head);
}

static size_t vxlan_get_size(const struct net_device *dev)
{

	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
C
Cong Wang 已提交
3792
		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
S
stephen hemminger 已提交
3793
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
C
Cong Wang 已提交
3794
		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
S
stephen hemminger 已提交
3795
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
H
Hangbin Liu 已提交
3796
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL_INHERIT */
S
stephen hemminger 已提交
3797
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
3798
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_DF */
3799
		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
S
stephen hemminger 已提交
3800
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
D
David Stevens 已提交
3801 3802 3803 3804
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L2MISS */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L3MISS */
3805
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_COLLECT_METADATA */
S
stephen hemminger 已提交
3806 3807
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
3808
		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
3809 3810 3811 3812
		nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
T
Tom Herbert 已提交
3813 3814
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
S
stephen hemminger 已提交
3815 3816 3817 3818 3819 3820
		0;
}

static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	const struct vxlan_dev *vxlan = netdev_priv(dev);
3821
	const struct vxlan_rdst *dst = &vxlan->default_dst;
3822
	struct ifla_vxlan_port_range ports = {
3823 3824
		.low =  htons(vxlan->cfg.port_min),
		.high = htons(vxlan->cfg.port_max),
3825
	};
S
stephen hemminger 已提交
3826

3827
	if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni)))
S
stephen hemminger 已提交
3828 3829
		goto nla_put_failure;

C
Cong Wang 已提交
3830 3831
	if (!vxlan_addr_any(&dst->remote_ip)) {
		if (dst->remote_ip.sa.sa_family == AF_INET) {
3832 3833
			if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP,
					    dst->remote_ip.sin.sin_addr.s_addr))
C
Cong Wang 已提交
3834 3835 3836
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
3837 3838
			if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6,
					     &dst->remote_ip.sin6.sin6_addr))
C
Cong Wang 已提交
3839 3840 3841 3842
				goto nla_put_failure;
#endif
		}
	}
S
stephen hemminger 已提交
3843

3844
	if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
S
stephen hemminger 已提交
3845 3846
		goto nla_put_failure;

3847 3848
	if (!vxlan_addr_any(&vxlan->cfg.saddr)) {
		if (vxlan->cfg.saddr.sa.sa_family == AF_INET) {
3849
			if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
3850
					    vxlan->cfg.saddr.sin.sin_addr.s_addr))
C
Cong Wang 已提交
3851 3852 3853
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
3854
			if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
3855
					     &vxlan->cfg.saddr.sin6.sin6_addr))
C
Cong Wang 已提交
3856 3857 3858 3859
				goto nla_put_failure;
#endif
		}
	}
S
stephen hemminger 已提交
3860

3861
	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
H
Hangbin Liu 已提交
3862 3863
	    nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT,
		       !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) ||
3864
	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
3865
	    nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
3866
	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
D
David Stevens 已提交
3867
	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
3868
			!!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
D
David Stevens 已提交
3869
	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
3870 3871 3872
			!!(vxlan->cfg.flags & VXLAN_F_PROXY)) ||
	    nla_put_u8(skb, IFLA_VXLAN_RSC,
		       !!(vxlan->cfg.flags & VXLAN_F_RSC)) ||
D
David Stevens 已提交
3873
	    nla_put_u8(skb, IFLA_VXLAN_L2MISS,
3874
			!!(vxlan->cfg.flags & VXLAN_F_L2MISS)) ||
D
David Stevens 已提交
3875
	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
3876
			!!(vxlan->cfg.flags & VXLAN_F_L3MISS)) ||
3877
	    nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA,
3878
		       !!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) ||
3879 3880 3881
	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
3882
	    nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
3883
			!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
3884
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
3885
			!!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
3886
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
3887
			!!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
T
Tom Herbert 已提交
3888
	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
3889
			!!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) ||
T
Tom Herbert 已提交
3890
	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
3891
			!!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)))
S
stephen hemminger 已提交
3892 3893
		goto nla_put_failure;

3894 3895 3896
	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
		goto nla_put_failure;

3897
	if (vxlan->cfg.flags & VXLAN_F_GBP &&
T
Thomas Graf 已提交
3898 3899 3900
	    nla_put_flag(skb, IFLA_VXLAN_GBP))
		goto nla_put_failure;

3901
	if (vxlan->cfg.flags & VXLAN_F_GPE &&
J
Jiri Benc 已提交
3902 3903 3904
	    nla_put_flag(skb, IFLA_VXLAN_GPE))
		goto nla_put_failure;

3905
	if (vxlan->cfg.flags & VXLAN_F_REMCSUM_NOPARTIAL &&
3906 3907 3908
	    nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
		goto nla_put_failure;

S
stephen hemminger 已提交
3909 3910 3911 3912 3913 3914
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

3915 3916 3917 3918 3919 3920 3921
static struct net *vxlan_get_link_net(const struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

	return vxlan->net;
}

S
stephen hemminger 已提交
3922 3923 3924 3925 3926 3927 3928 3929
/* rtnetlink link operations for devices of kind "vxlan". */
static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
	/* identity and private data */
	.kind		= "vxlan",
	.priv_size	= sizeof(struct vxlan_dev),

	/* netlink attribute parsing */
	.maxtype	= IFLA_VXLAN_MAX,
	.policy		= vxlan_policy,
	.validate	= vxlan_validate,

	/* device life cycle */
	.setup		= vxlan_setup,
	.newlink	= vxlan_newlink,
	.changelink	= vxlan_changelink,
	.dellink	= vxlan_dellink,

	/* dumping the configuration back to user space */
	.get_size	= vxlan_get_size,
	.fill_info	= vxlan_fill_info,
	.get_link_net	= vxlan_get_link_net,
};

3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947
/* Create and configure a vxlan device from inside the kernel.
 *
 * @net:              namespace passed to rtnl_create_link() and
 *                    __vxlan_dev_create()
 * @name:             interface name
 * @name_assign_type: forwarded unchanged to rtnl_create_link()
 * @conf:             vxlan configuration handed to __vxlan_dev_create()
 *
 * Returns the new net_device, or an ERR_PTR() carrying the error of the
 * step that failed.
 */
struct net_device *vxlan_dev_create(struct net *net, const char *name,
				    u8 name_assign_type,
				    struct vxlan_config *conf)
{
	/* No netlink attributes are supplied: every slot stays NULL. */
	struct nlattr *tb[IFLA_MAX + 1] = { NULL };
	struct net_device *dev;
	LIST_HEAD(list_kill);
	int err;

	dev = rtnl_create_link(net, name, name_assign_type,
			       &vxlan_link_ops, tb, NULL);
	if (IS_ERR(dev))
		return dev;

	err = __vxlan_dev_create(net, dev, conf, NULL);
	if (err < 0) {
		/* This failure path releases the device with free_netdev(). */
		free_netdev(dev);
		return ERR_PTR(err);
	}

	err = rtnl_configure_link(dev, NULL);
	if (err >= 0)
		return dev;

	/* Configuration failed after creation succeeded: tear the device
	 * down through the regular dellink + unregister path.
	 */
	vxlan_dellink(dev, &list_kill);
	unregister_netdevice_many(&list_kill);
	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(vxlan_dev_create);

3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992
static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
					     struct net_device *dev)
{
	struct vxlan_dev *vxlan, *next;
	LIST_HEAD(list_kill);

	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
		struct vxlan_rdst *dst = &vxlan->default_dst;

		/* In case we created vxlan device with carrier
		 * and we loose the carrier due to module unload
		 * we also need to remove vxlan device. In other
		 * cases, it's not necessary and remote_ifindex
		 * is 0 here, so no matches.
		 */
		if (dst->remote_ifindex == dev->ifindex)
			vxlan_dellink(vxlan->dev, &list_kill);
	}

	unregister_netdevice_many(&list_kill);
}

3993 3994
static int vxlan_netdevice_event(struct notifier_block *unused,
				 unsigned long event, void *ptr)
3995 3996
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3997
	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
3998

3999 4000
	if (event == NETDEV_UNREGISTER) {
		vxlan_offload_rx_ports(dev, false);
4001
		vxlan_handle_lowerdev_unregister(vn, dev);
4002 4003 4004 4005
	} else if (event == NETDEV_REGISTER) {
		vxlan_offload_rx_ports(dev, true);
	} else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO ||
		   event == NETDEV_UDP_TUNNEL_DROP_INFO) {
4006
		vxlan_offload_rx_ports(dev, event == NETDEV_UDP_TUNNEL_PUSH_INFO);
4007
	}
4008 4009 4010 4011 4012

	return NOTIFY_DONE;
}

/* Notifier block whose callback is vxlan_netdevice_event(). */
static struct notifier_block vxlan_notifier_block __read_mostly = {
	.notifier_call	= vxlan_netdevice_event,
};

4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042
/* Copy fdb_info->offloaded into the remote destination described by
 * @fdb_info, if both the FDB entry and that remote exist on @dev.
 * The lookup and the update run under vxlan->hash_lock.
 */
static void
vxlan_fdb_offloaded_set(struct net_device *dev,
			struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *rd;
	struct vxlan_fdb *fdb;

	spin_lock_bh(&vxlan->hash_lock);

	fdb = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
	if (fdb) {
		rd = vxlan_fdb_find_rdst(fdb, &fdb_info->remote_ip,
					 fdb_info->remote_port,
					 fdb_info->remote_vni,
					 fdb_info->remote_ifindex);
		if (rd)
			rd->offloaded = fdb_info->offloaded;
	}

	spin_unlock_bh(&vxlan->hash_lock);
}

P
Petr Machata 已提交
4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091
/* Create or replace, under vxlan->hash_lock, the FDB entry described by
 * @fdb_info, marking it NTF_EXT_LEARNED.
 *
 * Returns whatever vxlan_fdb_update() returns.
 */
static int
vxlan_fdb_external_learn_add(struct net_device *dev,
			     struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	int rc;

	spin_lock_bh(&vxlan->hash_lock);
	rc = vxlan_fdb_update(vxlan,
			      fdb_info->eth_addr,
			      &fdb_info->remote_ip,
			      NUD_REACHABLE,
			      NLM_F_CREATE | NLM_F_REPLACE,
			      fdb_info->remote_port,
			      fdb_info->vni,
			      fdb_info->remote_vni,
			      fdb_info->remote_ifindex,
			      NTF_USE | NTF_SELF | NTF_EXT_LEARNED,
			      false);
	spin_unlock_bh(&vxlan->hash_lock);

	return rc;
}

/* Delete, under vxlan->hash_lock, the externally learned FDB entry described
 * by @fdb_info.
 *
 * Returns -ENOENT when no entry matches the MAC/VNI, 0 when the entry exists
 * but is not flagged NTF_EXT_LEARNED (it is left untouched), and otherwise
 * the result of __vxlan_fdb_delete().
 */
static int
vxlan_fdb_external_learn_del(struct net_device *dev,
			     struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *fdb;
	int rc = 0;

	spin_lock_bh(&vxlan->hash_lock);

	fdb = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
	if (!fdb) {
		rc = -ENOENT;
		goto unlock;
	}

	if (fdb->flags & NTF_EXT_LEARNED)
		rc = __vxlan_fdb_delete(vxlan, fdb_info->eth_addr,
					fdb_info->remote_ip,
					fdb_info->remote_port,
					fdb_info->vni,
					fdb_info->remote_vni,
					fdb_info->remote_ifindex,
					false);

unlock:
	spin_unlock_bh(&vxlan->hash_lock);

	return rc;
}

4092 4093 4094 4095
static int vxlan_switchdev_event(struct notifier_block *unused,
				 unsigned long event, void *ptr)
{
	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
P
Petr Machata 已提交
4096 4097
	struct switchdev_notifier_vxlan_fdb_info *fdb_info;
	int err = 0;
4098 4099 4100 4101 4102

	switch (event) {
	case SWITCHDEV_VXLAN_FDB_OFFLOADED:
		vxlan_fdb_offloaded_set(dev, ptr);
		break;
P
Petr Machata 已提交
4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122
	case SWITCHDEV_VXLAN_FDB_ADD_TO_BRIDGE:
		fdb_info = ptr;
		err = vxlan_fdb_external_learn_add(dev, fdb_info);
		if (err) {
			err = notifier_from_errno(err);
			break;
		}
		fdb_info->offloaded = true;
		vxlan_fdb_offloaded_set(dev, fdb_info);
		break;
	case SWITCHDEV_VXLAN_FDB_DEL_TO_BRIDGE:
		fdb_info = ptr;
		err = vxlan_fdb_external_learn_del(dev, fdb_info);
		if (err) {
			err = notifier_from_errno(err);
			break;
		}
		fdb_info->offloaded = false;
		vxlan_fdb_offloaded_set(dev, fdb_info);
		break;
4123 4124
	}

P
Petr Machata 已提交
4125
	return err;
4126 4127 4128 4129 4130 4131
}

/* Notifier block whose callback is vxlan_switchdev_event(). */
static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = {
	.notifier_call	= vxlan_switchdev_event,
};

S
stephen hemminger 已提交
4132 4133 4134
/* Per-namespace init: set up the empty device list, the socket lock and
 * every bucket of the socket hash table. Always returns 0.
 */
static __net_init int vxlan_init_net(struct net *net)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	unsigned int bucket;

	INIT_LIST_HEAD(&vn->vxlan_list);
	spin_lock_init(&vn->sock_lock);

	for (bucket = 0; bucket < PORT_HASH_SIZE; bucket++)
		INIT_HLIST_HEAD(&vn->sock_list[bucket]);

	return 0;
}

4146
static void vxlan_destroy_tunnels(struct net *net, struct list_head *head)
N
Nicolas Dichtel 已提交
4147 4148 4149 4150
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan, *next;
	struct net_device *dev, *aux;
4151
	unsigned int h;
N
Nicolas Dichtel 已提交
4152 4153 4154

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == &vxlan_link_ops)
4155
			unregister_netdevice_queue(dev, head);
N
Nicolas Dichtel 已提交
4156 4157 4158 4159 4160

	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
		/* If vxlan->dev is in the same netns, it has already been added
		 * to the list by the previous loop.
		 */
T
Tom Herbert 已提交
4161 4162
		if (!net_eq(dev_net(vxlan->dev), net)) {
			gro_cells_destroy(&vxlan->gro_cells);
4163
			unregister_netdevice_queue(vxlan->dev, head);
T
Tom Herbert 已提交
4164
		}
N
Nicolas Dichtel 已提交
4165 4166
	}

4167 4168
	for (h = 0; h < PORT_HASH_SIZE; ++h)
		WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h]));
N
Nicolas Dichtel 已提交
4169 4170
}

4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183
static void __net_exit vxlan_exit_batch_net(struct list_head *net_list)
{
	struct net *net;
	LIST_HEAD(list);

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list)
		vxlan_destroy_tunnels(net, &list);

	unregister_netdevice_many(&list);
	rtnl_unlock();
}

S
stephen hemminger 已提交
4184 4185
static struct pernet_operations vxlan_net_ops = {
	.init = vxlan_init_net,
4186
	.exit_batch = vxlan_exit_batch_net,
S
stephen hemminger 已提交
4187 4188 4189 4190 4191 4192 4193 4194 4195 4196
	.id   = &vxlan_net_id,
	.size = sizeof(struct vxlan_net),
};

/* Module init: seed vxlan_salt with random bytes, then register the pernet
 * operations, the netdevice notifier, the switchdev notifier and the
 * rtnl link ops, in that order. On failure, everything registered so far
 * is unregistered in reverse order and the error is returned.
 */
static int __init vxlan_init_module(void)
{
	int err;

	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));

	err = register_pernet_subsys(&vxlan_net_ops);
	if (err)
		return err;

	err = register_netdevice_notifier(&vxlan_notifier_block);
	if (err)
		goto err_unreg_pernet;

	err = register_switchdev_notifier(&vxlan_switchdev_notifier_block);
	if (err)
		goto err_unreg_netdev_notifier;

	err = rtnl_link_register(&vxlan_link_ops);
	if (err)
		goto err_unreg_switchdev_notifier;

	return 0;

err_unreg_switchdev_notifier:
	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
err_unreg_netdev_notifier:
	unregister_netdevice_notifier(&vxlan_notifier_block);
err_unreg_pernet:
	unregister_pernet_subsys(&vxlan_net_ops);
	return err;
}
late_initcall(vxlan_init_module);
S
stephen hemminger 已提交
4224 4225 4226

/* Module exit: undo vxlan_init_module() in reverse registration order. */
static void __exit vxlan_cleanup_module(void)
{
	rtnl_link_unregister(&vxlan_link_ops);

	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
	unregister_netdevice_notifier(&vxlan_notifier_block);

	/* rcu_barrier() is called by netns */
	unregister_pernet_subsys(&vxlan_net_ops);
}
module_exit(vxlan_cleanup_module);

/* Module metadata. */
MODULE_LICENSE("GPL");
MODULE_VERSION(VXLAN_VERSION);
MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>");
MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic");
MODULE_ALIAS_RTNL_LINK("vxlan");