vxlan.c 119.8 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0-only
S
stephen hemminger 已提交
2
/*
R
Rami Rosen 已提交
3
 * VXLAN: Virtual eXtensible Local Area Network
S
stephen hemminger 已提交
4
 *
5
 * Copyright (c) 2012-2013 Vyatta Inc.
S
stephen hemminger 已提交
6 7 8 9 10 11 12 13 14 15 16
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/udp.h>
#include <linux/igmp.h>
#include <linux/if_ether.h>
Y
Yan Burman 已提交
17
#include <linux/ethtool.h>
D
David Stevens 已提交
18 19
#include <net/arp.h>
#include <net/ndisc.h>
20
#include <net/ipv6_stubs.h>
S
stephen hemminger 已提交
21 22 23 24 25 26
#include <net/ip.h>
#include <net/icmp.h>
#include <net/rtnetlink.h>
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
27
#include <net/tun_proto.h>
28
#include <net/vxlan.h>
29
#include <net/nexthop.h>
30

C
Cong Wang 已提交
31 32
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ip6_tunnel.h>
33
#include <net/ip6_checksum.h>
C
Cong Wang 已提交
34
#endif
S
stephen hemminger 已提交
35 36 37

#define VXLAN_VERSION	"0.1"

38 39
#define PORT_HASH_BITS	8
#define PORT_HASH_SIZE  (1<<PORT_HASH_BITS)
S
stephen hemminger 已提交
40 41 42
#define FDB_AGE_DEFAULT 300 /* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */

43 44
/* UDP port for VXLAN traffic.
 * The IANA assigned port is 4789, but the Linux default is 8472
S
Stephen Hemminger 已提交
45
 * for compatibility with early adopters.
46
 */
47 48
static unsigned short vxlan_port __read_mostly = 8472;
module_param_named(udp_port, vxlan_port, ushort, 0444);
S
stephen hemminger 已提交
49 50 51 52 53 54
MODULE_PARM_DESC(udp_port, "Destination UDP port");

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

55
static unsigned int vxlan_net_id;
56
static struct rtnl_link_ops vxlan_link_ops;
57

58
static const u8 all_zeros_mac[ETH_ALEN + 2];
59

60
static int vxlan_sock_add(struct vxlan_dev *vxlan);
61

62 63
static void vxlan_vs_del_dev(struct vxlan_dev *vxlan);

64 65 66 67
/* per-network namespace private data for this module */
struct vxlan_net {
	struct list_head  vxlan_list;
	struct hlist_head sock_list[PORT_HASH_SIZE];
68
	spinlock_t	  sock_lock;
69 70
};

S
stephen hemminger 已提交
71 72 73 74 75 76
/* Forwarding table entry */
struct vxlan_fdb {
	struct hlist_node hlist;	/* linked list of entries */
	struct rcu_head	  rcu;
	unsigned long	  updated;	/* jiffies */
	unsigned long	  used;
77
	struct list_head  remotes;
78
	u8		  eth_addr[ETH_ALEN];
S
stephen hemminger 已提交
79
	u16		  state;	/* see ndm_state */
80
	__be32		  vni;
P
Petr Machata 已提交
81
	u16		  flags;	/* see ndm_flags and below */
82 83
	struct list_head  nh_list;
	struct nexthop __rcu *nh;
84
	struct vxlan_dev  __rcu *vdev;
S
stephen hemminger 已提交
85 86
};

P
Petr Machata 已提交
87 88
#define NTF_VXLAN_ADDED_BY_USER 0x100

S
stephen hemminger 已提交
89 90 91
/* salt for hash table */
static u32 vxlan_salt __read_mostly;

T
Thomas Graf 已提交
92 93
static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
{
94 95
	return vs->flags & VXLAN_F_COLLECT_METADATA ||
	       ip_tunnel_collect_metadata();
T
Thomas Graf 已提交
96 97
}

C
Cong Wang 已提交
98 99 100 101
#if IS_ENABLED(CONFIG_IPV6)
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
J
Jiri Benc 已提交
102 103 104 105 106 107
	if (a->sa.sa_family != b->sa.sa_family)
		return false;
	if (a->sa.sa_family == AF_INET6)
		return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
	else
		return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
C
Cong Wang 已提交
108 109 110 111
}

static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
J
Jiri Benc 已提交
112
	if (nla_len(nla) >= sizeof(struct in6_addr)) {
113
		ip->sin6.sin6_addr = nla_get_in6_addr(nla);
J
Jiri Benc 已提交
114 115 116
		ip->sa.sa_family = AF_INET6;
		return 0;
	} else if (nla_len(nla) >= sizeof(__be32)) {
117
		ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
J
Jiri Benc 已提交
118 119 120 121 122
		ip->sa.sa_family = AF_INET;
		return 0;
	} else {
		return -EAFNOSUPPORT;
	}
C
Cong Wang 已提交
123 124 125
}

static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
J
Jiri Benc 已提交
126
			      const union vxlan_addr *ip)
C
Cong Wang 已提交
127
{
J
Jiri Benc 已提交
128
	if (ip->sa.sa_family == AF_INET6)
129
		return nla_put_in6_addr(skb, attr, &ip->sin6.sin6_addr);
J
Jiri Benc 已提交
130
	else
131
		return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
C
Cong Wang 已提交
132 133 134 135 136 137 138
}

#else /* !CONFIG_IPV6 */

static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
J
Jiri Benc 已提交
139
	return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
C
Cong Wang 已提交
140 141 142 143
}

static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
J
Jiri Benc 已提交
144 145 146
	if (nla_len(nla) >= sizeof(struct in6_addr)) {
		return -EAFNOSUPPORT;
	} else if (nla_len(nla) >= sizeof(__be32)) {
147
		ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
J
Jiri Benc 已提交
148 149 150 151 152
		ip->sa.sa_family = AF_INET;
		return 0;
	} else {
		return -EAFNOSUPPORT;
	}
C
Cong Wang 已提交
153 154 155
}

static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
J
Jiri Benc 已提交
156
			      const union vxlan_addr *ip)
C
Cong Wang 已提交
157
{
158
	return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
C
Cong Wang 已提交
159 160 161
}
#endif

162
/* Virtual Network hash table head */
163
static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni)
164
{
165
	return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)];
166 167 168 169
}

/* Bucket of the per-namespace socket table that @port hashes to. */
static inline struct hlist_head *vs_head(struct net *net, __be16 port)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	const u32 bucket = hash_32(ntohs(port), PORT_HASH_BITS);

	return &vn->sock_list[bucket];
}

176 177 178
/* First remote destination for a forwarding entry.
 * Guaranteed to be non-NULL because remotes are never deleted.
 */
179
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
180
{
181 182
	if (rcu_access_pointer(fdb->nh))
		return NULL;
183 184 185 186 187
	return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
}

static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
{
188 189
	if (rcu_access_pointer(fdb->nh))
		return NULL;
190
	return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
191 192
}

193 194 195 196
/* Find VXLAN socket based on network namespace, address family and UDP port
 * and enabled unshareable flags.
 */
static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
197
					  __be16 port, u32 flags, int ifindex)
198 199
{
	struct vxlan_sock *vs;
200 201

	flags &= VXLAN_F_RCV_FLAGS;
202 203

	hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
204
		if (inet_sk(vs->sock->sk)->inet_sport == port &&
205
		    vxlan_get_sk_family(vs) == family &&
206 207
		    vs->flags == flags &&
		    vs->sock->sk->sk_bound_dev_if == ifindex)
208 209 210
			return vs;
	}
	return NULL;
S
stephen hemminger 已提交
211 212
}

213 214
/* Find the vxlan device attached to socket @vs whose default remote VNI
 * equals @vni.  Devices flagged VXLAN_F_IPV6_LINKLOCAL must additionally
 * match @ifindex (only checked when IPv6 is enabled).
 */
static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex,
					   __be32 vni)
{
	struct vxlan_dev_node *node;

	/* For flow based devices, map all packets to VNI 0 */
	if (vs->flags & VXLAN_F_COLLECT_METADATA)
		vni = 0;

	hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) {
		if (node->vxlan->default_dst.remote_vni != vni)
			continue;

		if (IS_ENABLED(CONFIG_IPV6) &&
		    (node->vxlan->cfg.flags & VXLAN_F_IPV6_LINKLOCAL) &&
		    node->vxlan->cfg.remote_ifindex != ifindex)
			continue;

		return node->vxlan;
	}

	return NULL;
}

P
Pravin B Shelar 已提交
240
/* Look up VNI in a per net namespace table */
241 242 243
static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex,
					__be32 vni, sa_family_t family,
					__be16 port, u32 flags)
P
Pravin B Shelar 已提交
244 245 246
{
	struct vxlan_sock *vs;

247
	vs = vxlan_find_sock(net, family, port, flags, ifindex);
P
Pravin B Shelar 已提交
248 249 250
	if (!vs)
		return NULL;

251
	return vxlan_vs_find_vni(vs, ifindex, vni);
P
Pravin B Shelar 已提交
252 253
}

S
stephen hemminger 已提交
254 255
/* Fill in neighbour message in skbuff.
 * Returns 0 on success or -EMSGSIZE when the header or an attribute does
 * not fit; on attribute failure the partially built message is cancelled.
 */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
			  const struct vxlan_fdb *fdb,
			  u32 portid, u32 seq, int type, unsigned int flags,
			  const struct vxlan_rdst *rdst)
{
	unsigned long now = jiffies;
	bool send_eth = true;
	bool send_ip = true;
	struct nda_cacheinfo ci;
	struct nlmsghdr *nlh;
	struct nexthop *nh;
	struct ndmsg *ndm;
	int nh_family;
	u32 nh_id;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
	if (!nlh)
		return -EMSGSIZE;

	ndm = nlmsg_data(nlh);
	memset(ndm, 0, sizeof(*ndm));

	/* Copy what is needed from the nexthop while inside the RCU read
	 * section; afterwards 'nh' is only tested for NULL, never
	 * dereferenced.
	 */
	rcu_read_lock();
	nh = rcu_dereference(fdb->nh);
	if (nh) {
		nh_family = nexthop_get_family(nh);
		nh_id = nh->id;
	}
	rcu_read_unlock();

	if (type != RTM_GETNEIGH) {
		ndm->ndm_family = AF_BRIDGE;
	} else {
		if (rdst) {
			send_ip = !vxlan_addr_any(&rdst->remote_ip);
			ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET;
		} else if (nh) {
			ndm->ndm_family = nh_family;
		}
		send_eth = !is_zero_ether_addr(fdb->eth_addr);
	}

	ndm->ndm_state = fdb->state;
	ndm->ndm_ifindex = vxlan->dev->ifindex;
	ndm->ndm_flags = fdb->flags;
	if (rdst && rdst->offloaded)
		ndm->ndm_flags |= NTF_OFFLOADED;
	ndm->ndm_type = RTN_UNICAST;

	/* Only report a netns id when device and link namespaces differ. */
	if (!net_eq(dev_net(vxlan->dev), vxlan->net) &&
	    nla_put_s32(skb, NDA_LINK_NETNSID,
			peernet2id(dev_net(vxlan->dev), vxlan->net)))
		goto nla_put_failure;

	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
		goto nla_put_failure;

	/* A nexthop id is reported instead of the per-remote attributes. */
	if (nh) {
		if (nla_put_u32(skb, NDA_NH_ID, nh_id))
			goto nla_put_failure;
	} else if (rdst) {
		if (send_ip && vxlan_nla_put_addr(skb, NDA_DST,
						  &rdst->remote_ip))
			goto nla_put_failure;

		/* port, VNI and ifindex are emitted only when they differ
		 * from the device defaults (or are non-zero for ifindex)
		 */
		if (rdst->remote_port &&
		    rdst->remote_port != vxlan->cfg.dst_port &&
		    nla_put_be16(skb, NDA_PORT, rdst->remote_port))
			goto nla_put_failure;
		if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
		    nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
			goto nla_put_failure;
		if (rdst->remote_ifindex &&
		    nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
			goto nla_put_failure;
	}

	if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni &&
	    nla_put_u32(skb, NDA_SRC_VNI,
			be32_to_cpu(fdb->vni)))
		goto nla_put_failure;

	ci.ndm_confirmed = 0;
	ci.ndm_refcnt	 = 0;
	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);

	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static inline size_t vxlan_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct ndmsg))
		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
C
Cong Wang 已提交
355
		+ nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
356
		+ nla_total_size(sizeof(__be16)) /* NDA_PORT */
357 358
		+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
		+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
359
		+ nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */
S
stephen hemminger 已提交
360 361 362
		+ nla_total_size(sizeof(struct nda_cacheinfo));
}

P
Petr Machata 已提交
363 364
static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			       struct vxlan_rdst *rd, int type)
S
stephen hemminger 已提交
365 366 367 368 369 370 371 372 373
{
	struct net *net = dev_net(vxlan->dev);
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
	if (skb == NULL)
		goto errout;

374
	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd);
S
stephen hemminger 已提交
375 376 377 378 379 380 381 382 383 384 385 386 387 388
	if (err < 0) {
		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}

389 390 391
static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan,
			    const struct vxlan_fdb *fdb,
			    const struct vxlan_rdst *rd,
392
			    struct netlink_ext_ack *extack,
393 394 395
			    struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	fdb_info->info.dev = vxlan->dev;
396
	fdb_info->info.extack = extack;
397 398 399 400 401 402 403 404 405 406
	fdb_info->remote_ip = rd->remote_ip;
	fdb_info->remote_port = rd->remote_port;
	fdb_info->remote_vni = rd->remote_vni;
	fdb_info->remote_ifindex = rd->remote_ifindex;
	memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN);
	fdb_info->vni = fdb->vni;
	fdb_info->offloaded = rd->offloaded;
	fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER;
}

407 408 409
static int vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan,
					      struct vxlan_fdb *fdb,
					      struct vxlan_rdst *rd,
410 411
					      bool adding,
					      struct netlink_ext_ack *extack)
P
Petr Machata 已提交
412 413 414
{
	struct switchdev_notifier_vxlan_fdb_info info;
	enum switchdev_notifier_type notifier_type;
415
	int ret;
P
Petr Machata 已提交
416 417

	if (WARN_ON(!rd))
418
		return 0;
P
Petr Machata 已提交
419 420 421

	notifier_type = adding ? SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE
			       : SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE;
422
	vxlan_fdb_switchdev_notifier_info(vxlan, fdb, rd, NULL, &info);
423
	ret = call_switchdev_notifiers(notifier_type, vxlan->dev,
424
				       &info.info, extack);
425
	return notifier_to_errno(ret);
P
Petr Machata 已提交
426 427
}

428
/* Notify about an FDB change: optionally through switchdev first, then
 * through rtnetlink.
 *
 * For RTM_NEWNEIGH a switchdev failure is returned and the netlink
 * notification is skipped; for RTM_DELNEIGH the switchdev result is
 * ignored.  Other message types bypass switchdev.  Returns 0 otherwise.
 */
static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			    struct vxlan_rdst *rd, int type, bool swdev_notify,
			    struct netlink_ext_ack *extack)
{
	if (swdev_notify && rd) {
		if (type == RTM_NEWNEIGH) {
			int err;

			err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
								 true, extack);
			if (err)
				return err;
		} else if (type == RTM_DELNEIGH) {
			vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
							   false, extack);
		}
	}

	__vxlan_fdb_notify(vxlan, fdb, rd, type);
	return 0;
}

C
Cong Wang 已提交
453
static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
D
David Stevens 已提交
454 455
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
456 457 458 459
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
	struct vxlan_rdst remote = {
C
Cong Wang 已提交
460
		.remote_ip = *ipa, /* goes to NDA_DST */
461
		.remote_vni = cpu_to_be32(VXLAN_N_VID),
462
	};
463

464
	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL);
D
David Stevens 已提交
465 466 467 468
}

static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
{
469 470 471
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
472
	struct vxlan_rdst remote = { };
D
David Stevens 已提交
473 474 475

	memcpy(f.eth_addr, eth_addr, ETH_ALEN);

476
	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL);
D
David Stevens 已提交
477 478
}

S
stephen hemminger 已提交
479 480 481 482 483 484 485 486
/* Hash Ethernet address */
static u32 eth_hash(const unsigned char *addr)
{
	u64 value = get_unaligned((u64 *)addr);

	/* only want 6 bytes */
#ifdef __BIG_ENDIAN
	value >>= 16;
487 488
#else
	value <<= 16;
S
stephen hemminger 已提交
489 490 491 492
#endif
	return hash_64(value, FDB_HASH_BITS);
}

493 494 495 496 497 498 499 500
/* Hash a MAC address together with a VNI, salted with vxlan_salt, into an
 * FDB bucket index.
 */
static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
{
	/* use 1 byte of OUI and 3 bytes of NIC */
	const u32 mac_key = get_unaligned((u32 *)(addr + 2));
	const u32 hash = jhash_2words(mac_key, vni, vxlan_salt);

	return hash & (FDB_HASH_SIZE - 1);
}

501 502 503 504 505 506 507 508
/* FDB bucket index for @mac: the VNI takes part in the hash only on
 * VXLAN_F_COLLECT_METADATA devices.
 */
static u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni)
{
	const bool per_vni = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;

	return per_vni ? eth_vni_hash(mac, vni) : eth_hash(mac);
}

S
stephen hemminger 已提交
509 510
/* Hash chain to use given mac address */
static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
511
						const u8 *mac, __be32 vni)
S
stephen hemminger 已提交
512
{
513
	return &vxlan->fdb_head[fdb_head_index(vxlan, mac, vni)];
S
stephen hemminger 已提交
514 515 516
}

/* Look up Ethernet address in forwarding table */
517
static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
518
					  const u8 *mac, __be32 vni)
S
stephen hemminger 已提交
519
{
520
	struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni);
S
stephen hemminger 已提交
521 522
	struct vxlan_fdb *f;

523
	hlist_for_each_entry_rcu(f, head, hlist) {
524
		if (ether_addr_equal(mac, f->eth_addr)) {
525
			if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
526 527 528 529 530 531
				if (vni == f->vni)
					return f;
			} else {
				return f;
			}
		}
S
stephen hemminger 已提交
532 533 534 535 536
	}

	return NULL;
}

537
static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
538
					const u8 *mac, __be32 vni)
539 540 541
{
	struct vxlan_fdb *f;

542
	f = __vxlan_find_mac(vxlan, mac, vni);
543
	if (f && f->used != jiffies)
544 545 546 547 548
		f->used = jiffies;

	return f;
}

549 550
/* caller should hold vxlan->hash_lock */
static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
C
Cong Wang 已提交
551
					      union vxlan_addr *ip, __be16 port,
552
					      __be32 vni, __u32 ifindex)
553
{
554
	struct vxlan_rdst *rd;
555

556
	list_for_each_entry(rd, &f->remotes, list) {
C
Cong Wang 已提交
557
		if (vxlan_addr_equal(&rd->remote_ip, ip) &&
558 559 560
		    rd->remote_port == port &&
		    rd->remote_vni == vni &&
		    rd->remote_ifindex == ifindex)
561
			return rd;
562
	}
563

564 565 566
	return NULL;
}

567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590
/* Look up the unicast FDB entry for @mac/@vni on @dev and describe its
 * first remote in @fdb_info.
 *
 * Returns 0 on success, -EINVAL for a multicast or all-zero @mac, and
 * -ENOENT when no entry matches or the matching entry has no remote to
 * report (first_remote_rcu() returns NULL for nexthop-backed entries).
 * @fdb_info is written only on success.
 */
int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni,
		      struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	/* padded copy: the FDB hash may load 8 bytes starting at the MAC */
	u8 eth_addr[ETH_ALEN + 2] = { 0 };
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	int rc = 0;

	if (is_multicast_ether_addr(mac) ||
	    is_zero_ether_addr(mac))
		return -EINVAL;

	ether_addr_copy(eth_addr, mac);

	rcu_read_lock();

	f = __vxlan_find_mac(vxlan, eth_addr, vni);
	if (!f) {
		rc = -ENOENT;
		goto out;
	}

	rdst = first_remote_rcu(f);
	if (!rdst) {
		/* vxlan_fdb_switchdev_notifier_info() dereferences the
		 * remote unconditionally; bail out instead of crashing.
		 */
		rc = -ENOENT;
		goto out;
	}

	vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, NULL, fdb_info);

out:
	rcu_read_unlock();
	return rc;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_find_uc);

P
Petr Machata 已提交
599 600 601
static int vxlan_fdb_notify_one(struct notifier_block *nb,
				const struct vxlan_dev *vxlan,
				const struct vxlan_fdb *f,
602 603
				const struct vxlan_rdst *rdst,
				struct netlink_ext_ack *extack)
P
Petr Machata 已提交
604 605 606 607
{
	struct switchdev_notifier_vxlan_fdb_info fdb_info;
	int rc;

608
	vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, extack, &fdb_info);
P
Petr Machata 已提交
609 610 611 612 613 614
	rc = nb->notifier_call(nb, SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE,
			       &fdb_info);
	return notifier_to_errno(rc);
}

/* Replay every remote of every FDB entry with VNI @vni on @dev to notifier
 * block @nb.  Each hash bucket is walked under its own lock.
 *
 * Returns -EINVAL if @dev is not a vxlan device, the first non-zero
 * notifier error (which stops the walk), or 0.
 */
int vxlan_fdb_replay(const struct net_device *dev, __be32 vni,
		     struct notifier_block *nb,
		     struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan;
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	unsigned int h;
	int rc = 0;

	if (!netif_is_vxlan(dev))
		return -EINVAL;
	vxlan = netdev_priv(dev);

	for (h = 0; h < FDB_HASH_SIZE && !rc; ++h) {
		spin_lock_bh(&vxlan->hash_lock[h]);
		hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) {
			if (f->vni != vni)
				continue;

			list_for_each_entry(rdst, &f->remotes, list) {
				rc = vxlan_fdb_notify_one(nb, vxlan,
							  f, rdst,
							  extack);
				if (rc)
					break;
			}
			if (rc)
				break;
		}
		spin_unlock_bh(&vxlan->hash_lock[h]);
	}

	return rc;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_replay);

651 652 653 654 655 656 657 658 659 660 661 662
/* Clear the 'offloaded' mark on every remote of every FDB entry with VNI
 * @vni on @dev.  Does nothing if @dev is not a vxlan device.
 */
void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni)
{
	struct vxlan_dev *vxlan;
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	unsigned int h;

	if (!netif_is_vxlan(dev))
		return;
	vxlan = netdev_priv(dev);

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		spin_lock_bh(&vxlan->hash_lock[h]);
		hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) {
			if (f->vni != vni)
				continue;

			list_for_each_entry(rdst, &f->remotes, list)
				rdst->offloaded = false;
		}
		spin_unlock_bh(&vxlan->hash_lock[h]);
	}
}
EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload);

674 675
/* Replace destination of unicast mac */
static int vxlan_fdb_replace(struct vxlan_fdb *f,
676
			     union vxlan_addr *ip, __be16 port, __be32 vni,
677
			     __u32 ifindex, struct vxlan_rdst *oldrd)
678 679 680 681 682 683 684 685 686 687
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

	rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
	if (!rd)
		return 0;
688

689
	*oldrd = *rd;
690
	dst_cache_reset(&rd->dst_cache);
C
Cong Wang 已提交
691
	rd->remote_ip = *ip;
692 693 694
	rd->remote_port = port;
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;
695
	rd->offloaded = false;
696 697 698
	return 1;
}

699 700
/* Add/update destinations for multicast */
static int vxlan_fdb_append(struct vxlan_fdb *f,
701
			    union vxlan_addr *ip, __be16 port, __be32 vni,
702
			    __u32 ifindex, struct vxlan_rdst **rdp)
703 704 705 706 707 708 709
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

710 711 712
	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
	if (rd == NULL)
		return -ENOBUFS;
713 714 715 716 717 718

	if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) {
		kfree(rd);
		return -ENOBUFS;
	}

C
Cong Wang 已提交
719
	rd->remote_ip = *ip;
720
	rd->remote_port = port;
721
	rd->offloaded = false;
722 723
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;
724 725 726

	list_add_tail_rcu(&rd->list, &f->remotes);

727
	*rdp = rd;
728 729 730
	return 1;
}

T
Tom Herbert 已提交
731 732 733
static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
					  unsigned int off,
					  struct vxlanhdr *vh, size_t hdrlen,
734 735
					  __be32 vni_field,
					  struct gro_remcsum *grc,
736
					  bool nopartial)
T
Tom Herbert 已提交
737
{
738
	size_t start, offset;
T
Tom Herbert 已提交
739 740

	if (skb->remcsum_offload)
741
		return vh;
T
Tom Herbert 已提交
742 743 744 745

	if (!NAPI_GRO_CB(skb)->csum_valid)
		return NULL;

746 747
	start = vxlan_rco_start(vni_field);
	offset = start + vxlan_rco_offset(vni_field);
T
Tom Herbert 已提交
748

749 750
	vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen,
				     start, offset, grc, nopartial);
T
Tom Herbert 已提交
751 752 753 754 755 756

	skb->remcsum_offload = 1;

	return vh;
}

757 758 759
static struct sk_buff *vxlan_gro_receive(struct sock *sk,
					 struct list_head *head,
					 struct sk_buff *skb)
760
{
761 762
	struct sk_buff *pp = NULL;
	struct sk_buff *p;
763
	struct vxlanhdr *vh, *vh2;
764
	unsigned int hlen, off_vx;
765
	int flush = 1;
766
	struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk);
767
	__be32 flags;
768 769 770
	struct gro_remcsum grc;

	skb_gro_remcsum_init(&grc);
771 772 773 774 775 776 777 778 779 780

	off_vx = skb_gro_offset(skb);
	hlen = off_vx + sizeof(*vh);
	vh   = skb_gro_header_fast(skb, off_vx);
	if (skb_gro_header_hard(skb, hlen)) {
		vh = skb_gro_header_slow(skb, hlen, off_vx);
		if (unlikely(!vh))
			goto out;
	}

T
Tom Herbert 已提交
781 782
	skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));

783
	flags = vh->vx_flags;
T
Tom Herbert 已提交
784 785 786

	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
		vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr),
787
				       vh->vx_vni, &grc,
788 789
				       !!(vs->flags &
					  VXLAN_F_REMCSUM_NOPARTIAL));
T
Tom Herbert 已提交
790 791 792 793 794

		if (!vh)
			goto out;
	}

795 796
	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */

797
	list_for_each_entry(p, head, list) {
798 799 800 801
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		vh2 = (struct vxlanhdr *)(p->data + off_vx);
T
Thomas Graf 已提交
802 803
		if (vh->vx_flags != vh2->vx_flags ||
		    vh->vx_vni != vh2->vx_vni) {
804 805 806 807 808
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}
	}

S
Sabrina Dubroca 已提交
809
	pp = call_gro_receive(eth_gro_receive, head, skb);
810
	flush = 0;
811 812

out:
813
	skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
814 815 816 817

	return pp;
}

818
static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
819
{
820 821 822
	/* Sets 'skb->inner_mac_header' since we are always called with
	 * 'skb->encapsulation' set.
	 */
823
	return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
824 825
}

826 827 828
/* Allocate (GFP_ATOMIC) and initialise an FDB entry for @mac.  The entry
 * starts with no nexthop and empty remote/nexthop lists, and is not linked
 * into any hash chain here.  Returns NULL on allocation failure.
 */
static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac,
					 __u16 state, __be32 src_vni,
					 __u16 ndm_flags)
{
	struct vxlan_fdb *entry = kmalloc(sizeof(*entry), GFP_ATOMIC);

	if (!entry)
		return NULL;

	memcpy(entry->eth_addr, mac, ETH_ALEN);
	entry->vni = src_vni;
	entry->state = state;
	entry->flags = ndm_flags;
	entry->updated = entry->used = jiffies;

	RCU_INIT_POINTER(entry->nh, NULL);
	RCU_INIT_POINTER(entry->vdev, vxlan);

	INIT_LIST_HEAD(&entry->nh_list);
	INIT_LIST_HEAD(&entry->remotes);

	return entry;
}

848 849 850 851 852 853 854 855
/* Count the entry in vxlan->addrcnt and link it at the head of the hash
 * chain selected by @mac/@src_vni.
 */
static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac,
			     __be32 src_vni, struct vxlan_fdb *f)
{
	struct hlist_head *chain = vxlan_fdb_head(vxlan, mac, src_vni);

	++vxlan->addrcnt;
	hlist_add_head_rcu(&f->hlist, chain);
}

856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883
/* Point @fdb at the nexthop with id @nhid.
 *
 * Returns 0 when @fdb already uses that nexthop, 1 after switching to it,
 * -EINVAL when the nexthop is missing, already deleted, not an fdb nexthop
 * or not a multipath group, and -EAFNOSUPPORT when the group's address
 * family does not fit the device's default remote.  A reference is taken
 * on the new nexthop and the old one (if any) is released.
 */
static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			       u32 nhid, struct netlink_ext_ack *extack)
{
	struct nexthop *old_nh = rtnl_dereference(fdb->nh);
	struct nh_group *nhg;
	struct nexthop *nh;
	bool family_ok = true;
	int err = -EINVAL;

	if (old_nh && old_nh->id == nhid)
		return 0;

	nh = nexthop_find_by_id(vxlan->net, nhid);
	if (!nh) {
		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
		return err;
	}

	/* no reference was obtained, so there is nothing to put */
	if (!nexthop_get(nh)) {
		NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
		return err;
	}

	if (!nh->is_fdb_nh) {
		NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop");
		goto err_put;
	}

	nhg = rtnl_dereference(nh->nh_grp);
	if (!nh->is_group || !nhg->mpath) {
		NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group");
		goto err_put;
	}

	/* check nexthop group family */
	switch (vxlan->default_dst.remote_ip.sa.sa_family) {
	case AF_INET:
		family_ok = nhg->has_v4;
		break;
	case AF_INET6:
		family_ok = !nhg->has_v4;
		break;
	}
	if (!family_ok) {
		err = -EAFNOSUPPORT;
		NL_SET_ERR_MSG(extack, "Nexthop group family not supported");
		goto err_put;
	}

	if (old_nh) {
		list_del_rcu(&fdb->nh_list);
		nexthop_put(old_nh);
	}
	rcu_assign_pointer(fdb->nh, nh);
	list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list);
	return 1;

err_put:
	nexthop_put(nh);
	return err;
}

S
stephen hemminger 已提交
922
static int vxlan_fdb_create(struct vxlan_dev *vxlan,
923 924
			    const u8 *mac, union vxlan_addr *ip,
			    __u16 state, __be16 port, __be32 src_vni,
P
Petr Machata 已提交
925
			    __be32 vni, __u32 ifindex, __u16 ndm_flags,
926 927
			    u32 nhid, struct vxlan_fdb **fdb,
			    struct netlink_ext_ack *extack)
928 929 930 931 932 933 934 935 936 937
{
	struct vxlan_rdst *rd = NULL;
	struct vxlan_fdb *f;
	int rc;

	if (vxlan->cfg.addrmax &&
	    vxlan->addrcnt >= vxlan->cfg.addrmax)
		return -ENOSPC;

	netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
938
	f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags);
939 940 941
	if (!f)
		return -ENOMEM;

942 943 944 945 946 947
	if (nhid)
		rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack);
	else
		rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
	if (rc < 0)
		goto errout;
948 949 950 951

	*fdb = f;

	return 0;
952 953 954 955

errout:
	kfree(f);
	return rc;
956 957
}

958
static void __vxlan_fdb_free(struct vxlan_fdb *f)
959 960
{
	struct vxlan_rdst *rd, *nd;
961 962 963 964 965
	struct nexthop *nh;

	nh = rcu_dereference_raw(f->nh);
	if (nh) {
		rcu_assign_pointer(f->nh, NULL);
966
		rcu_assign_pointer(f->vdev, NULL);
967 968
		nexthop_put(nh);
	}
969 970 971 972 973 974 975 976

	list_for_each_entry_safe(rd, nd, &f->remotes, list) {
		dst_cache_destroy(&rd->dst_cache);
		kfree(rd);
	}
	kfree(f);
}

977 978 979 980 981 982 983
/* RCU callback: free the FDB entry that embeds @head. */
static void vxlan_fdb_free(struct rcu_head *head)
{
	__vxlan_fdb_free(container_of(head, struct vxlan_fdb, rcu));
}

984 985 986 987 988 989 990 991
/* Remove @f from the forwarding table and free it after an RCU grace
 * period.  With @do_notify, RTM_DELNEIGH is sent first: once for a
 * nexthop-backed entry, otherwise once per remote.
 */
static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
			      bool do_notify, bool swdev_notify)
{
	struct vxlan_rdst *rd;

	netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr);

	--vxlan->addrcnt;

	if (do_notify) {
		if (rcu_access_pointer(f->nh)) {
			vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH,
					 swdev_notify, NULL);
		} else {
			list_for_each_entry(rd, &f->remotes, list)
				vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH,
						 swdev_notify, NULL);
		}
	}

	/* unlink from the hash chain and the nexthop list, then defer */
	hlist_del_rcu(&f->hlist);
	list_del_rcu(&f->nh_list);
	call_rcu(&f->rcu, vxlan_fdb_free);
}

1007 1008 1009 1010 1011 1012 1013 1014
/* RCU callback: destroy the dst cache of the remote that embeds @head and
 * free the remote.
 */
static void vxlan_dst_free(struct rcu_head *head)
{
	struct vxlan_rdst *remote = container_of(head, struct vxlan_rdst, rcu);

	dst_cache_destroy(&remote->dst_cache);
	kfree(remote);
}

1015 1016 1017 1018 1019
static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan,
				     union vxlan_addr *ip,
				     __u16 state, __u16 flags,
				     __be16 port, __be32 vni,
				     __u32 ifindex, __u16 ndm_flags,
1020
				     struct vxlan_fdb *f, u32 nhid,
1021 1022
				     bool swdev_notify,
				     struct netlink_ext_ack *extack)
S
stephen hemminger 已提交
1023
{
P
Petr Machata 已提交
1024
	__u16 fdb_flags = (ndm_flags & ~NTF_USE);
1025
	struct vxlan_rdst *rd = NULL;
1026
	struct vxlan_rdst oldrd;
S
stephen hemminger 已提交
1027
	int notify = 0;
1028 1029
	int rc = 0;
	int err;
S
stephen hemminger 已提交
1030

1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042
	if (nhid && !rcu_access_pointer(f->nh)) {
		NL_SET_ERR_MSG(extack,
			       "Cannot replace an existing non nexthop fdb with a nexthop");
		return -EOPNOTSUPP;
	}

	if (nhid && (flags & NLM_F_APPEND)) {
		NL_SET_ERR_MSG(extack,
			       "Cannot append to a nexthop fdb");
		return -EOPNOTSUPP;
	}

1043 1044 1045 1046 1047 1048 1049 1050 1051
	/* Do not allow an externally learned entry to take over an entry added
	 * by the user.
	 */
	if (!(fdb_flags & NTF_EXT_LEARNED) ||
	    !(f->flags & NTF_VXLAN_ADDED_BY_USER)) {
		if (f->state != state) {
			f->state = state;
			f->updated = jiffies;
			notify = 1;
1052
		}
1053 1054 1055 1056
		if (f->flags != fdb_flags) {
			f->flags = fdb_flags;
			f->updated = jiffies;
			notify = 1;
1057
		}
1058
	}
1059

1060 1061 1062 1063
	if ((flags & NLM_F_REPLACE)) {
		/* Only change unicasts */
		if (!(is_multicast_ether_addr(f->eth_addr) ||
		      is_zero_ether_addr(f->eth_addr))) {
1064 1065 1066 1067 1068 1069 1070 1071
			if (nhid) {
				rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack);
				if (rc < 0)
					return rc;
			} else {
				rc = vxlan_fdb_replace(f, ip, port, vni,
						       ifindex, &oldrd);
			}
1072
			notify |= rc;
1073
		} else {
1074
			NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries");
1075
			return -EOPNOTSUPP;
1076 1077 1078 1079 1080 1081
		}
	}
	if ((flags & NLM_F_APPEND) &&
	    (is_multicast_ether_addr(f->eth_addr) ||
	     is_zero_ether_addr(f->eth_addr))) {
		rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
1082

1083
		if (rc < 0)
1084
			return rc;
1085
		notify |= rc;
S
stephen hemminger 已提交
1086 1087
	}

1088 1089 1090
	if (ndm_flags & NTF_USE)
		f->used = jiffies;

1091 1092 1093
	if (notify) {
		if (rd == NULL)
			rd = first_remote_rtnl(f);
1094

1095
		err = vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH,
1096
				       swdev_notify, extack);
1097 1098
		if (err)
			goto err_notify;
1099
	}
S
stephen hemminger 已提交
1100 1101

	return 0;
1102 1103

err_notify:
1104 1105
	if (nhid)
		return err;
1106 1107
	if ((flags & NLM_F_REPLACE) && rc)
		*rd = oldrd;
1108
	else if ((flags & NLM_F_APPEND) && rc) {
1109
		list_del_rcu(&rd->list);
1110 1111
		call_rcu(&rd->rcu, vxlan_dst_free);
	}
1112
	return err;
S
stephen hemminger 已提交
1113 1114
}

1115 1116 1117 1118
static int vxlan_fdb_update_create(struct vxlan_dev *vxlan,
				   const u8 *mac, union vxlan_addr *ip,
				   __u16 state, __u16 flags,
				   __be16 port, __be32 src_vni, __be32 vni,
1119
				   __u32 ifindex, __u16 ndm_flags, u32 nhid,
1120 1121
				   bool swdev_notify,
				   struct netlink_ext_ack *extack)
1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133
{
	__u16 fdb_flags = (ndm_flags & ~NTF_USE);
	struct vxlan_fdb *f;
	int rc;

	/* Disallow replace to add a multicast entry */
	if ((flags & NLM_F_REPLACE) &&
	    (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
		return -EOPNOTSUPP;

	netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
	rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni,
1134
			      vni, ifindex, fdb_flags, nhid, &f, extack);
1135 1136 1137
	if (rc < 0)
		return rc;

1138
	vxlan_fdb_insert(vxlan, mac, src_vni, f);
1139
	rc = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH,
1140
			      swdev_notify, extack);
1141 1142 1143
	if (rc)
		goto err_notify;

1144
	return 0;
1145 1146 1147 1148

err_notify:
	vxlan_fdb_destroy(vxlan, f, false, false);
	return rc;
1149 1150 1151 1152 1153 1154 1155
}

/* Add new entry to forwarding table -- assumes lock held */
static int vxlan_fdb_update(struct vxlan_dev *vxlan,
			    const u8 *mac, union vxlan_addr *ip,
			    __u16 state, __u16 flags,
			    __be16 port, __be32 src_vni, __be32 vni,
1156
			    __u32 ifindex, __u16 ndm_flags, u32 nhid,
1157 1158
			    bool swdev_notify,
			    struct netlink_ext_ack *extack)
1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171
{
	struct vxlan_fdb *f;

	f = __vxlan_find_mac(vxlan, mac, src_vni);
	if (f) {
		if (flags & NLM_F_EXCL) {
			netdev_dbg(vxlan->dev,
				   "lost race to create %pM\n", mac);
			return -EEXIST;
		}

		return vxlan_fdb_update_existing(vxlan, ip, state, flags, port,
						 vni, ifindex, ndm_flags, f,
1172
						 nhid, swdev_notify, extack);
1173 1174 1175 1176 1177 1178
	} else {
		if (!(flags & NLM_F_CREATE))
			return -ENOENT;

		return vxlan_fdb_update_create(vxlan, mac, ip, state, flags,
					       port, src_vni, vni, ifindex,
1179 1180
					       ndm_flags, nhid, swdev_notify,
					       extack);
1181 1182 1183
	}
}

1184
/* Remove a single remote from an fdb entry: unlink it from the entry's
 * remote list, send RTM_DELNEIGH for it, and free it via
 * vxlan_dst_free() after a grace period.
 */
static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan,
				  struct vxlan_fdb *f,
				  struct vxlan_rdst *rd,
				  bool swdev_notify)
{
	list_del_rcu(&rd->list);

	vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify, NULL);

	call_rcu(&rd->rcu, vxlan_dst_free);
}

M
Mike Rapoport 已提交
1192
/* Decode the netlink attributes of an fdb request into a remote
 * address, destination port, VNIs, output ifindex and nexthop id.
 * Anything userspace left out falls back to the device configuration
 * (wildcard address of the device's family, cfg.dst_port,
 * default_dst.remote_vni, ifindex 0, nhid 0).
 *
 * Returns 0 on success, -EINVAL when NDA_NH_ID is combined with
 * NDA_DST/NDA_VNI/NDA_IFINDEX/NDA_PORT or when a fixed-size attribute
 * has the wrong length, -EADDRNOTAVAIL when NDA_IFINDEX names no device
 * in the vxlan device's namespace, or the error reported by
 * vxlan_nla_get_addr().
 */
static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
			   union vxlan_addr *ip, __be16 *port, __be32 *src_vni,
			   __be32 *vni, u32 *ifindex, u32 *nhid)
{
	struct net *net = dev_net(vxlan->dev);
	int err;

	/* A nexthop id excludes an explicit destination description */
	if (tb[NDA_NH_ID] && (tb[NDA_DST] || tb[NDA_VNI] || tb[NDA_IFINDEX] ||
	    tb[NDA_PORT]))
		return -EINVAL;

	if (tb[NDA_DST]) {
		err = vxlan_nla_get_addr(ip, tb[NDA_DST]);
		if (err)
			return err;
	} else {
		union vxlan_addr *remote = &vxlan->default_dst.remote_ip;

		/* No destination given: wildcard of the device's family */
		if (remote->sa.sa_family == AF_INET) {
			ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
			ip->sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			ip->sin6.sin6_addr = in6addr_any;
			ip->sa.sa_family = AF_INET6;
#endif
		}
	}

	if (tb[NDA_PORT]) {
		if (nla_len(tb[NDA_PORT]) != sizeof(__be16))
			return -EINVAL;
		*port = nla_get_be16(tb[NDA_PORT]);
	} else {
		*port = vxlan->cfg.dst_port;
	}

	if (tb[NDA_VNI]) {
		if (nla_len(tb[NDA_VNI]) != sizeof(u32))
			return -EINVAL;
		*vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
	} else {
		*vni = vxlan->default_dst.remote_vni;
	}

	if (tb[NDA_SRC_VNI]) {
		if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32))
			return -EINVAL;
		*src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI]));
	} else {
		*src_vni = vxlan->default_dst.remote_vni;
	}

	if (tb[NDA_IFINDEX]) {
		struct net_device *tdev;

		if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
			return -EINVAL;
		*ifindex = nla_get_u32(tb[NDA_IFINDEX]);
		tdev = __dev_get_by_index(net, *ifindex);
		if (!tdev)
			return -EADDRNOTAVAIL;
	} else {
		*ifindex = 0;
	}

	if (tb[NDA_NH_ID]) {
		/* Same length validation as every other u32 attribute
		 * above; a short attribute must not be read as a u32.
		 */
		if (nla_len(tb[NDA_NH_ID]) != sizeof(u32))
			return -EINVAL;
		*nhid = nla_get_u32(tb[NDA_NH_ID]);
	} else {
		*nhid = 0;
	}

	return 0;
}

/* Add static entry (via netlink).
 *
 * Rejects states other than NUD_PERMANENT/NUD_REACHABLE and requests
 * carrying neither NDA_DST nor NDA_NH_ID (-EINVAL), and destinations
 * whose address family differs from the device's default remote
 * (-EAFNOSUPPORT). The update itself runs under the hash bucket lock
 * for (addr, src_vni), with the entry marked NTF_VXLAN_ADDED_BY_USER.
 */
static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr, u16 vid, u16 flags,
			 struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	u32 ifindex, nhid, bucket;
	__be32 src_vni, vni;
	union vxlan_addr ip;
	__be16 port;
	int rc;

	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
		pr_info("RTM_NEWNEIGH with invalid state %#x\n",
			ndm->ndm_state);
		return -EINVAL;
	}

	if (!tb || !(tb[NDA_DST] || tb[NDA_NH_ID]))
		return -EINVAL;

	rc = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
			     &nhid);
	if (rc)
		return rc;

	if (ip.sa.sa_family != vxlan->default_dst.remote_ip.sa.sa_family)
		return -EAFNOSUPPORT;

	bucket = fdb_head_index(vxlan, addr, src_vni);

	spin_lock_bh(&vxlan->hash_lock[bucket]);
	rc = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags,
			      port, src_vni, vni, ifindex,
			      ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER,
			      nhid, true, extack);
	spin_unlock_bh(&vxlan->hash_lock[bucket]);

	return rc;
}

1309 1310
static int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
			      const unsigned char *addr, union vxlan_addr ip,
1311
			      __be16 port, __be32 src_vni, __be32 vni,
1312
			      u32 ifindex, bool swdev_notify)
S
stephen hemminger 已提交
1313
{
1314
	struct vxlan_rdst *rd = NULL;
1315
	struct vxlan_fdb *f;
1316
	int err = -ENOENT;
1317

1318
	f = vxlan_find_mac(vxlan, addr, src_vni);
1319
	if (!f)
1320
		return err;
1321

C
Cong Wang 已提交
1322 1323
	if (!vxlan_addr_any(&ip)) {
		rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex);
1324 1325 1326 1327 1328 1329 1330 1331
		if (!rd)
			goto out;
	}

	/* remove a destination if it's not the only one on the list,
	 * otherwise destroy the fdb entry
	 */
	if (rd && !list_is_singular(&f->remotes)) {
1332
		vxlan_fdb_dst_destroy(vxlan, f, rd, swdev_notify);
1333
		goto out;
S
stephen hemminger 已提交
1334
	}
1335

1336
	vxlan_fdb_destroy(vxlan, f, true, swdev_notify);
1337 1338

out:
1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349
	return 0;
}

/* Delete entry (via netlink).
 *
 * Parses the request attributes, then performs the deletion under the
 * hash bucket lock for (addr, src_vni). The parsed nexthop id is not
 * used here.
 */
static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
			    struct net_device *dev,
			    const unsigned char *addr, u16 vid)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	u32 ifindex, nhid, bucket;
	__be32 src_vni, vni;
	union vxlan_addr ip;
	__be16 port;
	int rc;

	rc = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
			     &nhid);
	if (rc)
		return rc;

	bucket = fdb_head_index(vxlan, addr, src_vni);

	spin_lock_bh(&vxlan->hash_lock[bucket]);
	rc = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex,
				true);
	spin_unlock_bh(&vxlan->hash_lock[bucket]);

	return rc;
}

/* Dump forwarding table */
static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
1371
			  struct net_device *dev,
1372
			  struct net_device *filter_dev, int *idx)
S
stephen hemminger 已提交
1373 1374 1375
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	unsigned int h;
1376
	int err = 0;
S
stephen hemminger 已提交
1377 1378 1379 1380

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct vxlan_fdb *f;

1381
		hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
1382 1383
			struct vxlan_rdst *rd;

1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394
			if (rcu_access_pointer(f->nh)) {
				err = vxlan_fdb_info(skb, vxlan, f,
						     NETLINK_CB(cb->skb).portid,
						     cb->nlh->nlmsg_seq,
						     RTM_NEWNEIGH,
						     NLM_F_MULTI, NULL);
				if (err < 0)
					goto out;
				continue;
			}

1395
			list_for_each_entry_rcu(rd, &f->remotes, list) {
1396
				if (*idx < cb->args[2])
1397 1398
					goto skip;

1399 1400 1401 1402 1403
				err = vxlan_fdb_info(skb, vxlan, f,
						     NETLINK_CB(cb->skb).portid,
						     cb->nlh->nlmsg_seq,
						     RTM_NEWNEIGH,
						     NLM_F_MULTI, rd);
1404
				if (err < 0)
1405 1406
					goto out;
skip:
1407
				*idx += 1;
1408
			}
S
stephen hemminger 已提交
1409 1410
		}
	}
1411
out:
1412
	return err;
S
stephen hemminger 已提交
1413 1414
}

R
Roopa Prabhu 已提交
1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447
/* Look up a single fdb entry by MAC address and VNI (NDA_VNI if given,
 * else the device's default remote VNI) and fill @skb with an
 * RTM_NEWNEIGH record describing its first remote.
 *
 * Returns the result of vxlan_fdb_info(), or -ENOENT (with an extack
 * message) when no entry matches.
 */
static int vxlan_fdb_get(struct sk_buff *skb,
			 struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr,
			 u16 vid, u32 portid, u32 seq,
			 struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *entry;
	__be32 vni;
	int rc;

	vni = tb[NDA_VNI] ? cpu_to_be32(nla_get_u32(tb[NDA_VNI])) :
			    vxlan->default_dst.remote_vni;

	rcu_read_lock();

	entry = __vxlan_find_mac(vxlan, addr, vni);
	if (entry) {
		rc = vxlan_fdb_info(skb, vxlan, entry, portid, seq,
				    RTM_NEWNEIGH, 0, first_remote_rcu(entry));
	} else {
		NL_SET_ERR_MSG(extack, "Fdb entry not found");
		rc = -ENOENT;
	}

	rcu_read_unlock();
	return rc;
}

S
stephen hemminger 已提交
1448 1449
/* Watch incoming packets to learn mapping between Ethernet address
 * and Tunnel endpoint.
1450
 * Return true if packet is bogus and should be dropped.
S
stephen hemminger 已提交
1451
 */
1452
static bool vxlan_snoop(struct net_device *dev,
1453
			union vxlan_addr *src_ip, const u8 *src_mac,
1454
			u32 src_ifindex, __be32 vni)
S
stephen hemminger 已提交
1455 1456 1457
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
1458 1459 1460 1461 1462 1463 1464
	u32 ifindex = 0;

#if IS_ENABLED(CONFIG_IPV6)
	if (src_ip->sa.sa_family == AF_INET6 &&
	    (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL))
		ifindex = src_ifindex;
#endif
S
stephen hemminger 已提交
1465

1466
	f = vxlan_find_mac(vxlan, src_mac, vni);
S
stephen hemminger 已提交
1467
	if (likely(f)) {
1468
		struct vxlan_rdst *rdst = first_remote_rcu(f);
1469

1470 1471
		if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) &&
			   rdst->remote_ifindex == ifindex))
1472 1473 1474
			return false;

		/* Don't migrate static entries, drop packets */
1475
		if (f->state & (NUD_PERMANENT | NUD_NOARP))
1476
			return true;
S
stephen hemminger 已提交
1477

1478 1479 1480 1481
		/* Don't override an fdb with nexthop with a learnt entry */
		if (rcu_access_pointer(f->nh))
			return true;

S
stephen hemminger 已提交
1482 1483
		if (net_ratelimit())
			netdev_info(dev,
C
Cong Wang 已提交
1484
				    "%pM migrated from %pIS to %pIS\n",
1485
				    src_mac, &rdst->remote_ip.sa, &src_ip->sa);
S
stephen hemminger 已提交
1486

C
Cong Wang 已提交
1487
		rdst->remote_ip = *src_ip;
S
stephen hemminger 已提交
1488
		f->updated = jiffies;
1489
		vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL);
S
stephen hemminger 已提交
1490
	} else {
1491 1492
		u32 hash_index = fdb_head_index(vxlan, src_mac, vni);

S
stephen hemminger 已提交
1493
		/* learned new entry */
1494
		spin_lock(&vxlan->hash_lock[hash_index]);
1495 1496 1497

		/* close off race between vxlan_flush and incoming packets */
		if (netif_running(dev))
1498
			vxlan_fdb_update(vxlan, src_mac, src_ip,
1499 1500
					 NUD_REACHABLE,
					 NLM_F_EXCL|NLM_F_CREATE,
1501
					 vxlan->cfg.dst_port,
1502
					 vni,
1503
					 vxlan->default_dst.remote_vni,
1504
					 ifindex, NTF_SELF, 0, true, NULL);
1505
		spin_unlock(&vxlan->hash_lock[hash_index]);
S
stephen hemminger 已提交
1506
	}
1507 1508

	return false;
S
stephen hemminger 已提交
1509 1510 1511
}

/* See if multicast group is already in use by other ID */
1512
static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
S
stephen hemminger 已提交
1513
{
1514
	struct vxlan_dev *vxlan;
1515
	struct vxlan_sock *sock4;
A
Arnd Bergmann 已提交
1516 1517 1518
#if IS_ENABLED(CONFIG_IPV6)
	struct vxlan_sock *sock6;
#endif
1519
	unsigned short family = dev->default_dst.remote_ip.sa.sa_family;
S
stephen hemminger 已提交
1520

1521 1522
	sock4 = rtnl_dereference(dev->vn4_sock);

1523 1524 1525
	/* The vxlan_sock is only used by dev, leaving group has
	 * no effect on other vxlan devices.
	 */
1526
	if (family == AF_INET && sock4 && refcount_read(&sock4->refcnt) == 1)
1527
		return false;
1528
#if IS_ENABLED(CONFIG_IPV6)
1529
	sock6 = rtnl_dereference(dev->vn6_sock);
1530
	if (family == AF_INET6 && sock6 && refcount_read(&sock6->refcnt) == 1)
1531 1532
		return false;
#endif
1533

1534
	list_for_each_entry(vxlan, &vn->vxlan_list, next) {
1535
		if (!netif_running(vxlan->dev) || vxlan == dev)
1536
			continue;
S
stephen hemminger 已提交
1537

1538 1539
		if (family == AF_INET &&
		    rtnl_dereference(vxlan->vn4_sock) != sock4)
1540
			continue;
1541
#if IS_ENABLED(CONFIG_IPV6)
1542 1543
		if (family == AF_INET6 &&
		    rtnl_dereference(vxlan->vn6_sock) != sock6)
1544 1545
			continue;
#endif
1546 1547 1548 1549 1550 1551 1552 1553 1554 1555

		if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip,
				      &dev->default_dst.remote_ip))
			continue;

		if (vxlan->default_dst.remote_ifindex !=
		    dev->default_dst.remote_ifindex)
			continue;

		return true;
1556
	}
S
stephen hemminger 已提交
1557 1558 1559 1560

	return false;
}

1561
static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
1562
{
1563
	struct vxlan_net *vn;
1564

1565
	if (!vs)
1566
		return false;
1567
	if (!refcount_dec_and_test(&vs->refcnt))
1568
		return false;
S
stephen hemminger 已提交
1569

1570
	vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id);
1571
	spin_lock(&vn->sock_lock);
1572
	hlist_del_rcu(&vs->hlist);
1573
	udp_tunnel_notify_del_rx_port(vs->sock,
1574 1575
				      (vs->flags & VXLAN_F_GPE) ?
				      UDP_TUNNEL_TYPE_VXLAN_GPE :
1576
				      UDP_TUNNEL_TYPE_VXLAN);
1577 1578
	spin_unlock(&vn->sock_lock);

1579
	return true;
S
stephen hemminger 已提交
1580 1581
}

1582 1583
static void vxlan_sock_release(struct vxlan_dev *vxlan)
{
1584
	struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
1585
#if IS_ENABLED(CONFIG_IPV6)
1586 1587
	struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

1588
	RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
1589 1590
#endif

1591
	RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
1592 1593
	synchronize_net();

1594 1595
	vxlan_vs_del_dev(vxlan);

1596 1597 1598
	if (__vxlan_sock_release_prep(sock4)) {
		udp_tunnel_sock_release(sock4->sock);
		kfree(sock4);
1599 1600 1601
	}

#if IS_ENABLED(CONFIG_IPV6)
1602 1603 1604
	if (__vxlan_sock_release_prep(sock6)) {
		udp_tunnel_sock_release(sock6->sock);
		kfree(sock6);
1605
	}
1606 1607 1608
#endif
}

1609
/* Update multicast group membership when first VNI on
1610
 * multicast address is brought up
1611
 */
1612
static int vxlan_igmp_join(struct vxlan_dev *vxlan)
S
stephen hemminger 已提交
1613
{
1614
	struct sock *sk;
C
Cong Wang 已提交
1615 1616
	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
	int ifindex = vxlan->default_dst.remote_ifindex;
1617
	int ret = -EINVAL;
S
stephen hemminger 已提交
1618

C
Cong Wang 已提交
1619
	if (ip->sa.sa_family == AF_INET) {
1620
		struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
C
Cong Wang 已提交
1621 1622 1623 1624 1625
		struct ip_mreqn mreq = {
			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
			.imr_ifindex		= ifindex,
		};

1626
		sk = sock4->sock->sk;
1627
		lock_sock(sk);
1628
		ret = ip_mc_join_group(sk, &mreq);
1629
		release_sock(sk);
C
Cong Wang 已提交
1630 1631
#if IS_ENABLED(CONFIG_IPV6)
	} else {
1632 1633 1634
		struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

		sk = sock6->sock->sk;
1635
		lock_sock(sk);
1636 1637
		ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
						   &ip->sin6.sin6_addr);
1638
		release_sock(sk);
C
Cong Wang 已提交
1639 1640
#endif
	}
S
stephen hemminger 已提交
1641

1642
	return ret;
S
stephen hemminger 已提交
1643 1644 1645
}

/* Inverse of vxlan_igmp_join when last VNI is brought down */
1646
static int vxlan_igmp_leave(struct vxlan_dev *vxlan)
S
stephen hemminger 已提交
1647
{
1648
	struct sock *sk;
C
Cong Wang 已提交
1649 1650
	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
	int ifindex = vxlan->default_dst.remote_ifindex;
1651
	int ret = -EINVAL;
S
stephen hemminger 已提交
1652

C
Cong Wang 已提交
1653
	if (ip->sa.sa_family == AF_INET) {
1654
		struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
C
Cong Wang 已提交
1655 1656 1657 1658 1659
		struct ip_mreqn mreq = {
			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
			.imr_ifindex		= ifindex,
		};

1660
		sk = sock4->sock->sk;
1661
		lock_sock(sk);
1662
		ret = ip_mc_leave_group(sk, &mreq);
1663
		release_sock(sk);
C
Cong Wang 已提交
1664 1665
#if IS_ENABLED(CONFIG_IPV6)
	} else {
1666 1667 1668
		struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

		sk = sock6->sock->sk;
1669
		lock_sock(sk);
1670 1671
		ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
						   &ip->sin6.sin6_addr);
1672
		release_sock(sk);
C
Cong Wang 已提交
1673 1674
#endif
	}
S
stephen hemminger 已提交
1675

1676
	return ret;
S
stephen hemminger 已提交
1677 1678
}

1679 1680
/* Handle the remote checksum offload (RCO) header extension.
 *
 * When the RCO flag is set and the skb is not already marked
 * remcsum_offload, the checksum start/offset encoded in the VNI field
 * are applied to the packet. In every successful case the RCO flag and
 * the non-VNI bits of vx_vni are cleared from @unparsed.
 *
 * Returns false only when the packet is too short to hold the checksum
 * field.
 */
static bool vxlan_remcsum(struct vxlanhdr *unparsed,
			  struct sk_buff *skb, u32 vxflags)
{
	size_t start, offset;

	if ((unparsed->vx_flags & VXLAN_HF_RCO) && !skb->remcsum_offload) {
		start = vxlan_rco_start(unparsed->vx_vni);
		offset = start + vxlan_rco_offset(unparsed->vx_vni);

		if (!pskb_may_pull(skb, offset + sizeof(u16)))
			return false;

		skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1),
				    start, offset,
				    !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL));
	}

	unparsed->vx_flags &= ~VXLAN_HF_RCO;
	unparsed->vx_vni &= VXLAN_VNI_MASK;
	return true;
}

1701
/* Handle the Group Based Policy (GBP) header extension.
 *
 * When the GBP flag is set, the policy id and the dont_learn /
 * policy_applied bits are collected into md->gbp. If the skb carries a
 * metadata dst it is flagged as holding VXLAN options. Outside
 * flow-based mode the value is also written to skb->mark. The GBP bits
 * are always cleared from @unparsed.
 */
static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
				struct sk_buff *skb, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed;

	if (unparsed->vx_flags & VXLAN_HF_GBP) {
		struct metadata_dst *tun_dst;

		md->gbp = ntohs(gbp->policy_id);

		tun_dst = (struct metadata_dst *)skb_dst(skb);
		if (tun_dst) {
			tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
			tun_dst->u.tun_info.options_len = sizeof(*md);
		}

		if (gbp->dont_learn)
			md->gbp |= VXLAN_GBP_DONT_LEARN;

		if (gbp->policy_applied)
			md->gbp |= VXLAN_GBP_POLICY_APPLIED;

		/* In flow-based mode, GBP is carried in dst_metadata */
		if (!(vxflags & VXLAN_F_COLLECT_METADATA))
			skb->mark = md->gbp;
	}

	unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS;
}

J
Jiri Benc 已提交
1731
static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed,
J
Jiri Benc 已提交
1732
				__be16 *protocol,
J
Jiri Benc 已提交
1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751
				struct sk_buff *skb, u32 vxflags)
{
	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed;

	/* Need to have Next Protocol set for interfaces in GPE mode. */
	if (!gpe->np_applied)
		return false;
	/* "The initial version is 0. If a receiver does not support the
	 * version indicated it MUST drop the packet.
	 */
	if (gpe->version != 0)
		return false;
	/* "When the O bit is set to 1, the packet is an OAM packet and OAM
	 * processing MUST occur." However, we don't implement OAM
	 * processing, thus drop the packet.
	 */
	if (gpe->oam_flag)
		return false;

1752 1753
	*protocol = tun_p_to_eth_p(gpe->next_protocol);
	if (!*protocol)
J
Jiri Benc 已提交
1754 1755 1756 1757 1758 1759
		return false;

	unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS;
	return true;
}

1760 1761
/* Prepare the inner Ethernet frame of a received packet and, when
 * learning is enabled, snoop its source MAC against the outer source
 * address.
 *
 * Returns false when the packet is to be dropped: either its source
 * MAC equals the vxlan device's own address, or vxlan_snoop() rejected
 * it.
 */
static bool vxlan_set_mac(struct vxlan_dev *vxlan,
			  struct vxlan_sock *vs,
			  struct sk_buff *skb, __be32 vni)
{
	/* sampled up front, before eth_type_trans() runs on the skb */
	u32 ifindex = skb->dev->ifindex;
	union vxlan_addr outer_src;

	skb_reset_mac_header(skb);
	skb->protocol = eth_type_trans(skb, vxlan->dev);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	/* Ignore packet loops (and multicast echo) */
	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
		return false;

	/* Get address from the outer IP header */
	if (vxlan_get_sk_family(vs) == AF_INET) {
		outer_src.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
		outer_src.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		outer_src.sin6.sin6_addr = ipv6_hdr(skb)->saddr;
		outer_src.sa.sa_family = AF_INET6;
#endif
	}

	return !((vxlan->cfg.flags & VXLAN_F_LEARN) &&
		 vxlan_snoop(skb->dev, &outer_src, eth_hdr(skb)->h_source,
			     ifindex, vni));
}

1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816
/* Run ECN decapsulation for the outer header @oiph using the helper
 * matching the socket's address family, and log (rate limited, only if
 * log_ecn_error is set) when the helper reports a non-zero result.
 *
 * Returns true unless the helper's result is greater than 1.
 */
static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
				  struct sk_buff *skb)
{
	int rc = 0;

	if (vxlan_get_sk_family(vs) == AF_INET)
		rc = IP_ECN_decapsulate(oiph, skb);
#if IS_ENABLED(CONFIG_IPV6)
	else
		rc = IP6_ECN_decapsulate(oiph, skb);
#endif

	if (unlikely(rc) && log_ecn_error) {
		if (vxlan_get_sk_family(vs) == AF_INET) {
			struct iphdr *iph = oiph;

			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					     &iph->saddr, iph->tos);
		} else {
			struct ipv6hdr *ip6h = oiph;

			net_info_ratelimited("non-ECT from %pI6\n",
					     &ip6h->saddr);
		}
	}

	return rc <= 1;
}

S
stephen hemminger 已提交
1817
/* Callback from net/ipv4/udp.c to receive packets */
1818
static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
S
stephen hemminger 已提交
1819
{
1820
	struct pcpu_sw_netstats *stats;
1821
	struct vxlan_dev *vxlan;
P
Pravin B Shelar 已提交
1822
	struct vxlan_sock *vs;
1823
	struct vxlanhdr unparsed;
T
Thomas Graf 已提交
1824 1825
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
J
Jiri Benc 已提交
1826
	__be16 protocol = htons(ETH_P_TEB);
J
Jiri Benc 已提交
1827
	bool raw_proto = false;
1828
	void *oiph;
1829
	__be32 vni = 0;
S
stephen hemminger 已提交
1830

J
Jiri Benc 已提交
1831
	/* Need UDP and VXLAN header to be present */
1832
	if (!pskb_may_pull(skb, VXLAN_HLEN))
1833
		goto drop;
S
stephen hemminger 已提交
1834

1835
	unparsed = *vxlan_hdr(skb);
J
Jiri Benc 已提交
1836 1837 1838 1839 1840 1841
	/* VNI flag always required to be set */
	if (!(unparsed.vx_flags & VXLAN_HF_VNI)) {
		netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
			   ntohl(vxlan_hdr(skb)->vx_flags),
			   ntohl(vxlan_hdr(skb)->vx_vni));
		/* Return non vxlan pkt */
1842
		goto drop;
S
stephen hemminger 已提交
1843
	}
J
Jiri Benc 已提交
1844 1845
	unparsed.vx_flags &= ~VXLAN_HF_VNI;
	unparsed.vx_vni &= ~VXLAN_VNI_MASK;
S
stephen hemminger 已提交
1846

1847
	vs = rcu_dereference_sk_user_data(sk);
P
Pravin B Shelar 已提交
1848
	if (!vs)
S
stephen hemminger 已提交
1849 1850
		goto drop;

1851 1852
	vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);

1853
	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
1854 1855 1856
	if (!vxlan)
		goto drop;

J
Jiri Benc 已提交
1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867
	/* For backwards compatibility, only allow reserved fields to be
	 * used by VXLAN extensions if explicitly requested.
	 */
	if (vs->flags & VXLAN_F_GPE) {
		if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
			goto drop;
		raw_proto = true;
	}

	if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto,
				   !net_eq(vxlan->net, dev_net(vxlan->dev))))
1868
		goto drop;
1869

T
Thomas Graf 已提交
1870
	if (vxlan_collect_metadata(vs)) {
1871
		struct metadata_dst *tun_dst;
J
Jiri Benc 已提交
1872

1873
		tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY,
1874
					 key32_to_tunnel_id(vni), sizeof(*md));
1875

T
Thomas Graf 已提交
1876 1877 1878
		if (!tun_dst)
			goto drop;

1879
		md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
1880 1881

		skb_dst_set(skb, (struct dst_entry *)tun_dst);
T
Thomas Graf 已提交
1882 1883 1884 1885
	} else {
		memset(md, 0, sizeof(*md));
	}

1886 1887 1888 1889
	if (vs->flags & VXLAN_F_REMCSUM_RX)
		if (!vxlan_remcsum(&unparsed, skb, vs->flags))
			goto drop;
	if (vs->flags & VXLAN_F_GBP)
1890
		vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md);
J
Jiri Benc 已提交
1891 1892 1893
	/* Note that GBP and GPE can never be active together. This is
	 * ensured in vxlan_dev_configure.
	 */
T
Thomas Graf 已提交
1894

1895
	if (unparsed.vx_flags || unparsed.vx_vni) {
1896 1897 1898 1899
		/* If there are any unprocessed flags remaining treat
		 * this as a malformed packet. This behavior diverges from
		 * VXLAN RFC (RFC7348) which stipulates that bits in reserved
		 * in reserved fields are to be ignored. The approach here
1900
		 * maintains compatibility with previous stack code, and also
1901 1902 1903
		 * is more robust and provides a little more security in
		 * adding extensions to VXLAN.
		 */
J
Jiri Benc 已提交
1904
		goto drop;
1905 1906
	}

J
Jiri Benc 已提交
1907
	if (!raw_proto) {
1908
		if (!vxlan_set_mac(vxlan, vs, skb, vni))
J
Jiri Benc 已提交
1909 1910
			goto drop;
	} else {
1911
		skb_reset_mac_header(skb);
J
Jiri Benc 已提交
1912 1913 1914
		skb->dev = vxlan->dev;
		skb->pkt_type = PACKET_HOST;
	}
1915 1916 1917 1918 1919 1920 1921 1922 1923 1924

	oiph = skb_network_header(skb);
	skb_reset_network_header(skb);

	if (!vxlan_ecn_decapsulate(vs, oiph, skb)) {
		++vxlan->dev->stats.rx_frame_errors;
		++vxlan->dev->stats.rx_errors;
		goto drop;
	}

1925 1926 1927 1928 1929 1930 1931 1932
	rcu_read_lock();

	if (unlikely(!(vxlan->dev->flags & IFF_UP))) {
		rcu_read_unlock();
		atomic_long_inc(&vxlan->dev->rx_dropped);
		goto drop;
	}

1933 1934 1935 1936 1937 1938 1939
	stats = this_cpu_ptr(vxlan->dev->tstats);
	u64_stats_update_begin(&stats->syncp);
	stats->rx_packets++;
	stats->rx_bytes += skb->len;
	u64_stats_update_end(&stats->syncp);

	gro_cells_receive(&vxlan->gro_cells, skb);
1940 1941 1942

	rcu_read_unlock();

P
Pravin B Shelar 已提交
1943 1944 1945
	return 0;

drop:
J
Jiri Benc 已提交
1946 1947 1948
	/* Consume bad packet */
	kfree_skb(skb);
	return 0;
P
Pravin B Shelar 已提交
1949 1950
}

S
Stefano Brivio 已提交
1951 1952 1953 1954 1955 1956 1957 1958
/* Callback from net/ipv{4,6}/udp.c to check that we have a VNI for errors */
static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan;
	struct vxlan_sock *vs;
	struct vxlanhdr *hdr;
	__be32 vni;

1959
	if (!pskb_may_pull(skb, skb_transport_offset(skb) + VXLAN_HLEN))
S
Stefano Brivio 已提交
1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978
		return -EINVAL;

	hdr = vxlan_hdr(skb);

	if (!(hdr->vx_flags & VXLAN_HF_VNI))
		return -EINVAL;

	vs = rcu_dereference_sk_user_data(sk);
	if (!vs)
		return -ENOENT;

	vni = vxlan_vni(hdr->vx_vni);
	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
	if (!vxlan)
		return -ENOENT;

	return 0;
}

1979
/* Answer an ARP request locally (proxy mode) instead of flooding it.
 *
 * If the target IP is a connected neighbour on @dev that is not a
 * bridge-local FDB entry, an ARP reply is built and injected into the
 * receive path. If no neighbour exists and VXLAN_F_L3MISS is set, an L3 miss
 * notification is raised instead.
 *
 * The request skb is always consumed; the return value is always
 * NETDEV_TX_OK.
 */
static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct arphdr *parp;
	u8 *arpptr, *sha;
	__be32 sip, tip;
	struct neighbour *n;

	if (dev->flags & IFF_NOARP)
		goto out;

	if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
		dev->stats.tx_dropped++;
		goto out;
	}
	parp = arp_hdr(skb);

	/* Only plain IPv4-over-Ethernet/802 ARP requests are handled */
	if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
	     parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
	    parp->ar_pro != htons(ETH_P_IP) ||
	    parp->ar_op != htons(ARPOP_REQUEST) ||
	    parp->ar_hln != dev->addr_len ||
	    parp->ar_pln != 4)
		goto out;

	/* Walk the ARP payload: sha, sip, tha, tip */
	arpptr = (u8 *)parp + sizeof(struct arphdr);
	sha = arpptr;
	arpptr += dev->addr_len;	/* sha */
	memcpy(&sip, arpptr, sizeof(sip));
	arpptr += sizeof(sip);
	arpptr += dev->addr_len;	/* tha */
	memcpy(&tip, arpptr, sizeof(tip));

	if (ipv4_is_loopback(tip) ||
	    ipv4_is_multicast(tip))
		goto out;

	n = neigh_lookup(&arp_tbl, &tip, dev);

	if (n) {
		struct vxlan_fdb *f;
		struct sk_buff	*reply;

		if (!(n->nud_state & NUD_CONNECTED)) {
			neigh_release(n);
			goto out;
		}

		/* NOTE(review): first_remote_rcu() is defined outside this
		 * view; confirm it is safe for FDB entries that carry a
		 * nexthop object and have no remotes.
		 */
		f = vxlan_find_mac(vxlan, n->ha, vni);
		if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
			/* bridge-local neighbor */
			neigh_release(n);
			goto out;
		}

		reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
				n->ha, sha);

		neigh_release(n);

		if (reply == NULL)
			goto out;

		/* Present the reply as a locally received frame */
		skb_reset_mac_header(reply);
		__skb_pull(reply, skb_network_offset(reply));
		reply->ip_summed = CHECKSUM_UNNECESSARY;
		reply->pkt_type = PACKET_HOST;

		if (netif_rx_ni(reply) == NET_RX_DROP)
			dev->stats.rx_dropped++;
	} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
		union vxlan_addr ipa = {
			.sin.sin_addr.s_addr = tip,
			.sin.sin_family = AF_INET,
		};

		vxlan_ip_miss(dev, &ipa);
	}
out:
	consume_skb(skb);
	return NETDEV_TX_OK;
}

C
Cong Wang 已提交
2061
#if IS_ENABLED(CONFIG_IPV6)
2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073
/* Build a Neighbour Advertisement answering the Neighbour Solicitation in
 * @request on behalf of neighbour @n.
 *
 * @request:  the received NS; it is fully linearized by this function.
 * @n:        neighbour entry whose hardware address and primary key (IPv6
 *            address) are advertised.
 * @isrouter: value for the Router flag of the advertisement.
 *
 * Returns a newly allocated skb positioned at the IPv6 header, or NULL when
 * the request has no device, cannot be linearized, allocation fails, or the
 * request contains a zero-length ND option.
 */
static struct sk_buff *vxlan_na_create(struct sk_buff *request,
	struct neighbour *n, bool isrouter)
{
	struct net_device *dev = request->dev;
	struct sk_buff *reply;
	struct nd_msg *ns, *na;
	struct ipv6hdr *pip6;
	u8 *daddr;
	int na_olen = 8; /* opt hdr + ETH_ALEN for target */
	int ns_olen;
	int i, len;

	if (dev == NULL || !pskb_may_pull(request, request->len))
		return NULL;

	len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
		sizeof(*na) + na_olen + dev->needed_tailroom;
	reply = alloc_skb(len, GFP_ATOMIC);
	if (reply == NULL)
		return NULL;

	reply->protocol = htons(ETH_P_IPV6);
	reply->dev = dev;
	skb_reserve(reply, LL_RESERVED_SPACE(request->dev));
	skb_push(reply, sizeof(struct ethhdr));
	skb_reset_mac_header(reply);

	ns = (struct nd_msg *)(ipv6_hdr(request) + 1);

	/* Default to the Ethernet source; a Source Link-Layer Address option
	 * in the solicitation overrides it.
	 */
	daddr = eth_hdr(request)->h_source;
	ns_olen = request->len - skb_network_offset(request) -
		sizeof(struct ipv6hdr) - sizeof(*ns);
	/* NOTE(review): the loop checks that the option length byte is in
	 * range but not that the full option body is; confirm a truncated
	 * trailing option cannot be read past ns_olen.
	 */
	for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) {
		/* A zero-length option would loop forever */
		if (!ns->opt[i + 1]) {
			kfree_skb(reply);
			return NULL;
		}
		if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
			daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
			break;
		}
	}

	/* Ethernet header */
	ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
	ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
	eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);
	reply->protocol = htons(ETH_P_IPV6);

	skb_pull(reply, sizeof(struct ethhdr));
	skb_reset_network_header(reply);
	skb_put(reply, sizeof(struct ipv6hdr));

	/* IPv6 header */

	pip6 = ipv6_hdr(reply);
	memset(pip6, 0, sizeof(struct ipv6hdr));
	pip6->version = 6;
	pip6->priority = ipv6_hdr(request)->priority;
	pip6->nexthdr = IPPROTO_ICMPV6;
	pip6->hop_limit = 255;
	pip6->daddr = ipv6_hdr(request)->saddr;
	pip6->saddr = *(struct in6_addr *)n->primary_key;

	skb_pull(reply, sizeof(struct ipv6hdr));
	skb_reset_transport_header(reply);

	/* Neighbor Advertisement */
	na = skb_put_zero(reply, sizeof(*na) + na_olen);
	na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
	na->icmph.icmp6_router = isrouter;
	na->icmph.icmp6_override = 1;
	na->icmph.icmp6_solicited = 1;
	na->target = ns->target;
	ether_addr_copy(&na->opt[2], n->ha);
	na->opt[0] = ND_OPT_TARGET_LL_ADDR;
	na->opt[1] = na_olen >> 3;

	na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
		&pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6,
		csum_partial(na, sizeof(*na)+na_olen, 0));

	pip6->payload_len = htons(sizeof(*na)+na_olen);

	/* Leave skb->data at the IPv6 header for the receive path */
	skb_push(reply, sizeof(struct ipv6hdr));

	reply->ip_summed = CHECKSUM_UNNECESSARY;

	return reply;
}

2153
/* Answer an IPv6 Neighbour Solicitation locally (proxy mode).
 *
 * If the solicited target is a connected neighbour on @dev that is not a
 * bridge-local FDB entry, a Neighbour Advertisement is built with
 * vxlan_na_create() and injected into the receive path. If no neighbour
 * exists and VXLAN_F_L3MISS is set, an L3 miss notification is raised.
 *
 * The request skb is always consumed; the return value is always
 * NETDEV_TX_OK.
 */
static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct in6_addr *daddr;
	const struct ipv6hdr *iphdr;
	struct inet6_dev *in6_dev;
	struct neighbour *n;
	struct nd_msg *msg;

	in6_dev = __in6_dev_get(dev);
	if (!in6_dev)
		goto out;

	iphdr = ipv6_hdr(skb);
	daddr = &iphdr->daddr;
	msg = (struct nd_msg *)(iphdr + 1);

	if (ipv6_addr_loopback(daddr) ||
	    ipv6_addr_is_multicast(&msg->target))
		goto out;

	n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev);

	if (n) {
		struct vxlan_fdb *f;
		struct sk_buff *reply;

		if (!(n->nud_state & NUD_CONNECTED)) {
			neigh_release(n);
			goto out;
		}

		/* NOTE(review): first_remote_rcu() is defined outside this
		 * view; confirm it is safe for FDB entries that carry a
		 * nexthop object and have no remotes.
		 */
		f = vxlan_find_mac(vxlan, n->ha, vni);
		if (f && vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
			/* bridge-local neighbor */
			neigh_release(n);
			goto out;
		}

		/* Router flag is taken from the FDB entry when there is one */
		reply = vxlan_na_create(skb, n,
					!!(f ? f->flags & NTF_ROUTER : 0));

		neigh_release(n);

		if (reply == NULL)
			goto out;

		if (netif_rx_ni(reply) == NET_RX_DROP)
			dev->stats.rx_dropped++;

	} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
		union vxlan_addr ipa = {
			.sin6.sin6_addr = msg->target,
			.sin6.sin6_family = AF_INET6,
		};

		vxlan_ip_miss(dev, &ipa);
	}

out:
	consume_skb(skb);
	return NETDEV_TX_OK;
}
#endif

D
David Stevens 已提交
2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228
static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct neighbour *n;

	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
		return false;

	n = NULL;
	switch (ntohs(eth_hdr(skb)->h_proto)) {
	case ETH_P_IP:
2229 2230 2231
	{
		struct iphdr *pip;

D
David Stevens 已提交
2232 2233 2234 2235
		if (!pskb_may_pull(skb, sizeof(struct iphdr)))
			return false;
		pip = ip_hdr(skb);
		n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
2236
		if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
C
Cong Wang 已提交
2237 2238
			union vxlan_addr ipa = {
				.sin.sin_addr.s_addr = pip->daddr,
2239
				.sin.sin_family = AF_INET,
C
Cong Wang 已提交
2240 2241 2242 2243 2244 2245
			};

			vxlan_ip_miss(dev, &ipa);
			return false;
		}

D
David Stevens 已提交
2246
		break;
2247 2248 2249 2250 2251 2252 2253 2254 2255 2256
	}
#if IS_ENABLED(CONFIG_IPV6)
	case ETH_P_IPV6:
	{
		struct ipv6hdr *pip6;

		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
			return false;
		pip6 = ipv6_hdr(skb);
		n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev);
2257
		if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
2258 2259
			union vxlan_addr ipa = {
				.sin6.sin6_addr = pip6->daddr,
2260
				.sin6.sin6_family = AF_INET6,
2261 2262 2263 2264 2265 2266 2267 2268 2269
			};

			vxlan_ip_miss(dev, &ipa);
			return false;
		}

		break;
	}
#endif
D
David Stevens 已提交
2270 2271 2272 2273 2274 2275 2276
	default:
		return false;
	}

	if (n) {
		bool diff;

2277
		diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha);
D
David Stevens 已提交
2278 2279 2280 2281 2282 2283 2284
		if (diff) {
			memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
				dev->addr_len);
			memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
		}
		neigh_release(n);
		return diff;
C
Cong Wang 已提交
2285 2286
	}

D
David Stevens 已提交
2287 2288 2289
	return false;
}

2290
/* Fill in the Group Based Policy fields of a VXLAN header from @md->gbp.
 *
 * Does nothing when @md->gbp is zero. Otherwise sets VXLAN_HF_GBP, the
 * don't-learn and policy-applied bits when requested, and the policy ID.
 * @vxflags is accepted but not used by this function.
 */
static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp;

	if (!md->gbp)
		return;

	/* The GBP header is an alternative view of the same header bytes */
	gbp = (struct vxlanhdr_gbp *)vxh;
	vxh->vx_flags |= VXLAN_HF_GBP;

	if (md->gbp & VXLAN_GBP_DONT_LEARN)
		gbp->dont_learn = 1;

	if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
		gbp->policy_applied = 1;

	gbp->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
}

J
Jiri Benc 已提交
2310 2311 2312 2313 2314 2315
/* Fill in the Generic Protocol Extension fields of a VXLAN header.
 *
 * Sets the next-protocol-applied bit and maps the Ethernet @protocol to a
 * GPE next-protocol value via tun_p_from_eth_p().
 * @vxflags is accepted but not used by this function.
 *
 * Returns 0 on success or -EPFNOSUPPORT when @protocol has no mapping.
 */
static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags,
			       __be16 protocol)
{
	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh;

	gpe->np_applied = 1;
	gpe->next_protocol = tun_p_from_eth_p(protocol);
	if (!gpe->next_protocol)
		return -EPFNOSUPPORT;
	return 0;
}

2322 2323 2324
static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
			   int iphdr_len, __be32 vni,
			   struct vxlan_metadata *md, u32 vxflags,
2325
			   bool udp_sum)
C
Cong Wang 已提交
2326 2327 2328 2329
{
	struct vxlanhdr *vxh;
	int min_headroom;
	int err;
T
Tom Herbert 已提交
2330
	int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
J
Jiri Benc 已提交
2331
	__be16 inner_protocol = htons(ETH_P_TEB);
T
Tom Herbert 已提交
2332

2333
	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
T
Tom Herbert 已提交
2334 2335 2336 2337 2338 2339
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		int csum_start = skb_checksum_start_offset(skb);

		if (csum_start <= VXLAN_MAX_REMCSUM_START &&
		    !(csum_start & VXLAN_RCO_SHIFT_MASK) &&
		    (skb->csum_offset == offsetof(struct udphdr, check) ||
2340
		     skb->csum_offset == offsetof(struct tcphdr, check)))
T
Tom Herbert 已提交
2341 2342
			type |= SKB_GSO_TUNNEL_REMCSUM;
	}
C
Cong Wang 已提交
2343 2344

	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
2345
			+ VXLAN_HLEN + iphdr_len;
2346 2347 2348

	/* Need space for new headers (invalidates iph ptr) */
	err = skb_cow_head(skb, min_headroom);
J
Jiri Benc 已提交
2349
	if (unlikely(err))
P
pravin shelar 已提交
2350
		return err;
2351

2352 2353
	err = iptunnel_handle_offloads(skb, type);
	if (err)
P
pravin shelar 已提交
2354
		return err;
2355

2356
	vxh = __skb_push(skb, sizeof(*vxh));
2357 2358
	vxh->vx_flags = VXLAN_HF_VNI;
	vxh->vx_vni = vxlan_vni_field(vni);
2359

T
Tom Herbert 已提交
2360
	if (type & SKB_GSO_TUNNEL_REMCSUM) {
2361
		unsigned int start;
T
Tom Herbert 已提交
2362

2363 2364 2365
		start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr);
		vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset);
		vxh->vx_flags |= VXLAN_HF_RCO;
T
Tom Herbert 已提交
2366 2367 2368 2369 2370 2371 2372

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}
	}

2373 2374
	if (vxflags & VXLAN_F_GBP)
		vxlan_build_gbp_hdr(vxh, vxflags, md);
J
Jiri Benc 已提交
2375 2376 2377
	if (vxflags & VXLAN_F_GPE) {
		err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
		if (err < 0)
P
pravin shelar 已提交
2378
			return err;
J
Jiri Benc 已提交
2379 2380
		inner_protocol = skb->protocol;
	}
T
Thomas Graf 已提交
2381

J
Jiri Benc 已提交
2382
	skb_set_inner_protocol(skb, inner_protocol);
2383
	return 0;
2384 2385
}

2386 2387
/* Find the IPv4 route for an encapsulated packet, using @dst_cache when
 * ip_tunnel_dst_cache_usable() allows it.
 *
 * The cache is bypassed when @tos is non-zero and no tunnel @info is given.
 * On a successful lookup *@saddr is updated with the source address chosen
 * by routing, and the route is stored in the cache when caching is in use.
 *
 * Returns the route, or ERR_PTR(-EIO) when @sock4 is NULL,
 * ERR_PTR(-ELOOP) when the route points back at @dev, or
 * ERR_PTR(-ENETUNREACH) when the lookup fails.
 */
static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct net_device *dev,
				      struct vxlan_sock *sock4,
				      struct sk_buff *skb, int oif, u8 tos,
				      __be32 daddr, __be32 *saddr, __be16 dport, __be16 sport,
				      struct dst_cache *dst_cache,
				      const struct ip_tunnel_info *info)
{
	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
	struct rtable *rt = NULL;
	struct flowi4 fl4;

	if (!sock4)
		return ERR_PTR(-EIO);

	if (tos && !info)
		use_cache = false;
	if (use_cache) {
		rt = dst_cache_get_ip4(dst_cache, saddr);
		if (rt)
			return rt;
	}

	memset(&fl4, 0, sizeof(fl4));
	fl4.flowi4_oif = oif;
	fl4.flowi4_tos = RT_TOS(tos);
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_proto = IPPROTO_UDP;
	fl4.daddr = daddr;
	fl4.saddr = *saddr;
	fl4.fl4_dport = dport;
	fl4.fl4_sport = sport;

	rt = ip_route_output_key(vxlan->net, &fl4);
	if (!IS_ERR(rt)) {
		/* Refuse routes that would loop back into this device */
		if (rt->dst.dev == dev) {
			netdev_dbg(dev, "circular route to %pI4\n", &daddr);
			ip_rt_put(rt);
			return ERR_PTR(-ELOOP);
		}

		*saddr = fl4.saddr;
		if (use_cache)
			dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
	} else {
		netdev_dbg(dev, "no route to %pI4\n", &daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	return rt;
}

2436 2437
#if IS_ENABLED(CONFIG_IPV6)
static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
2438
					  struct net_device *dev,
2439
					  struct vxlan_sock *sock6,
2440
					  struct sk_buff *skb, int oif, u8 tos,
2441
					  __be32 label,
2442
					  const struct in6_addr *daddr,
2443
					  struct in6_addr *saddr,
2444
					  __be16 dport, __be16 sport,
2445 2446
					  struct dst_cache *dst_cache,
					  const struct ip_tunnel_info *info)
2447
{
2448
	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
2449 2450 2451
	struct dst_entry *ndst;
	struct flowi6 fl6;

2452 2453 2454
	if (!sock6)
		return ERR_PTR(-EIO);

2455 2456
	if (tos && !info)
		use_cache = false;
2457
	if (use_cache) {
2458 2459 2460 2461 2462
		ndst = dst_cache_get_ip6(dst_cache, saddr);
		if (ndst)
			return ndst;
	}

2463 2464 2465
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.daddr = *daddr;
2466
	fl6.saddr = *saddr;
2467
	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tos), label);
2468 2469
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = IPPROTO_UDP;
2470 2471
	fl6.fl6_dport = dport;
	fl6.fl6_sport = sport;
2472

2473 2474 2475
	ndst = ipv6_stub->ipv6_dst_lookup_flow(vxlan->net, sock6->sock->sk,
					       &fl6, NULL);
	if (unlikely(IS_ERR(ndst))) {
2476 2477 2478 2479 2480 2481 2482 2483 2484
		netdev_dbg(dev, "no route to %pI6\n", daddr);
		return ERR_PTR(-ENETUNREACH);
	}

	if (unlikely(ndst->dev == dev)) {
		netdev_dbg(dev, "circular route to %pI6\n", daddr);
		dst_release(ndst);
		return ERR_PTR(-ELOOP);
	}
2485 2486

	*saddr = fl6.saddr;
2487
	if (use_cache)
2488
		dst_cache_set_ip6(dst_cache, ndst, saddr);
2489 2490 2491 2492
	return ndst;
}
#endif

2493 2494
/* Bypass encapsulation if the destination is local.
 *
 * Hands @skb directly to @dst_vxlan's receive path instead of encapsulating
 * it. If the destination device learns addresses (VXLAN_F_LEARN), the source
 * MAC is recorded against a loopback address of the destination's address
 * family. TX statistics are charged to @src_vxlan; RX statistics to
 * @dst_vxlan on successful delivery, otherwise rx_dropped is incremented on
 * the destination device. The skb is always consumed.
 */
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
			       struct vxlan_dev *dst_vxlan, __be32 vni)
{
	struct pcpu_sw_netstats *tx_stats, *rx_stats;
	union vxlan_addr loopback;
	union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
	struct net_device *dev;
	/* Sample the length now; the skb is not ours after netif_rx() */
	int len = skb->len;

	tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
	rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats);
	skb->pkt_type = PACKET_HOST;
	skb->encapsulation = 0;
	skb->dev = dst_vxlan->dev;
	__skb_pull(skb, skb_network_offset(skb));

	if (remote_ip->sa.sa_family == AF_INET) {
		loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		loopback.sa.sa_family =  AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		loopback.sin6.sin6_addr = in6addr_loopback;
		loopback.sa.sa_family =  AF_INET6;
#endif
	}

	rcu_read_lock();
	dev = skb->dev;
	if (unlikely(!(dev->flags & IFF_UP))) {
		kfree_skb(skb);
		goto drop;
	}

	if (dst_vxlan->cfg.flags & VXLAN_F_LEARN)
		vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni);

	u64_stats_update_begin(&tx_stats->syncp);
	tx_stats->tx_packets++;
	tx_stats->tx_bytes += len;
	u64_stats_update_end(&tx_stats->syncp);

	if (netif_rx(skb) == NET_RX_SUCCESS) {
		u64_stats_update_begin(&rx_stats->syncp);
		rx_stats->rx_packets++;
		rx_stats->rx_bytes += len;
		u64_stats_update_end(&rx_stats->syncp);
	} else {
drop:
		dev->stats.rx_dropped++;
	}
	rcu_read_unlock();
}

2547
static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
2548 2549 2550 2551
				 struct vxlan_dev *vxlan,
				 union vxlan_addr *daddr,
				 __be16 dst_port, int dst_ifindex, __be32 vni,
				 struct dst_entry *dst,
2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566
				 u32 rt_flags)
{
#if IS_ENABLED(CONFIG_IPV6)
	/* IPv6 rt-flags are checked against RTF_LOCAL, but the value of
	 * RTF_LOCAL is equal to RTCF_LOCAL. So to keep code simple
	 * we can use RTCF_LOCAL which works for ipv4 and ipv6 route entry.
	 */
	BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL);
#endif
	/* Bypass encapsulation if the destination is local */
	if (rt_flags & RTCF_LOCAL &&
	    !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
		struct vxlan_dev *dst_vxlan;

		dst_release(dst);
2567
		dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni,
2568
					   daddr->sa.sa_family, dst_port,
2569
					   vxlan->cfg.flags);
2570 2571 2572 2573 2574 2575
		if (!dst_vxlan) {
			dev->stats.tx_errors++;
			kfree_skb(skb);

			return -ENOENT;
		}
2576
		vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni);
2577 2578 2579 2580 2581 2582
		return 1;
	}

	return 0;
}

2583
static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
2584 2585
			   __be32 default_vni, struct vxlan_rdst *rdst,
			   bool did_rsc)
S
stephen hemminger 已提交
2586
{
2587
	struct dst_cache *dst_cache;
2588
	struct ip_tunnel_info *info;
S
stephen hemminger 已提交
2589
	struct vxlan_dev *vxlan = netdev_priv(dev);
P
pravin shelar 已提交
2590
	const struct iphdr *old_iph = ip_hdr(skb);
C
Cong Wang 已提交
2591
	union vxlan_addr *dst;
2592
	union vxlan_addr remote_ip, local_ip;
T
Thomas Graf 已提交
2593 2594
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
C
Cong Wang 已提交
2595
	__be16 src_port = 0, dst_port;
2596
	struct dst_entry *ndst = NULL;
2597
	__be32 vni, label;
S
stephen hemminger 已提交
2598
	__u8 tos, ttl;
2599
	int ifindex;
2600
	int err;
2601
	u32 flags = vxlan->cfg.flags;
2602
	bool udp_sum = false;
2603
	bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));
S
stephen hemminger 已提交
2604

2605
	info = skb_tunnel_info(skb);
2606

T
Thomas Graf 已提交
2607
	if (rdst) {
P
pravin shelar 已提交
2608 2609 2610 2611
		dst = &rdst->remote_ip;
		if (vxlan_addr_any(dst)) {
			if (did_rsc) {
				/* short-circuited back to local bridge */
2612
				vxlan_encap_bypass(skb, vxlan, vxlan, default_vni);
P
pravin shelar 已提交
2613 2614 2615 2616 2617
				return;
			}
			goto drop;
		}

2618
		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
2619
		vni = (rdst->remote_vni) ? : default_vni;
2620
		ifindex = rdst->remote_ifindex;
2621
		local_ip = vxlan->cfg.saddr;
2622
		dst_cache = &rdst->dst_cache;
P
pravin shelar 已提交
2623
		md->gbp = skb->mark;
H
Hangbin Liu 已提交
2624 2625 2626 2627 2628 2629 2630
		if (flags & VXLAN_F_TTL_INHERIT) {
			ttl = ip_tunnel_get_ttl(old_iph, skb);
		} else {
			ttl = vxlan->cfg.ttl;
			if (!ttl && vxlan_addr_multicast(dst))
				ttl = 1;
		}
P
pravin shelar 已提交
2631 2632 2633 2634 2635 2636 2637 2638 2639 2640

		tos = vxlan->cfg.tos;
		if (tos == 1)
			tos = ip_tunnel_get_dsfield(old_iph, skb);

		if (dst->sa.sa_family == AF_INET)
			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
		else
			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
		label = vxlan->cfg.label;
T
Thomas Graf 已提交
2641 2642 2643 2644 2645 2646
	} else {
		if (!info) {
			WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
				  dev->name);
			goto drop;
		}
2647
		remote_ip.sa.sa_family = ip_tunnel_info_af(info);
2648
		if (remote_ip.sa.sa_family == AF_INET) {
2649
			remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
2650 2651
			local_ip.sin.sin_addr.s_addr = info->key.u.ipv4.src;
		} else {
2652
			remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;
2653 2654
			local_ip.sin6.sin6_addr = info->key.u.ipv6.src;
		}
T
Thomas Graf 已提交
2655
		dst = &remote_ip;
P
pravin shelar 已提交
2656 2657
		dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
		vni = tunnel_id_to_key32(info->key.tun_id);
2658
		ifindex = 0;
2659
		dst_cache = &info->dst_cache;
2660 2661 2662
		if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
			if (info->options_len < sizeof(*md))
				goto drop;
P
pravin shelar 已提交
2663
			md = ip_tunnel_info_opts(info);
2664
		}
2665 2666
		ttl = info->key.ttl;
		tos = info->key.tos;
2667
		label = info->key.label;
2668
		udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
2669
	}
P
pravin shelar 已提交
2670 2671
	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
				     vxlan->cfg.port_max, true);
2672

J
Jakub Kicinski 已提交
2673
	rcu_read_lock();
C
Cong Wang 已提交
2674
	if (dst->sa.sa_family == AF_INET) {
2675
		struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
P
pravin shelar 已提交
2676
		struct rtable *rt;
P
pravin shelar 已提交
2677
		__be16 df = 0;
2678

2679 2680 2681
		if (!ifindex)
			ifindex = sock4->sock->sk->sk_bound_dev_if;

2682
		rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos,
2683
				     dst->sin.sin_addr.s_addr,
2684
				     &local_ip.sin.sin_addr.s_addr,
2685
				     dst_port, src_port,
2686
				     dst_cache, info);
2687 2688
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
P
pravin shelar 已提交
2689
			goto tx_error;
2690
		}
C
Cong Wang 已提交
2691

2692
		if (!info) {
2693
			/* Bypass encapsulation if the destination is local */
2694
			err = encap_bypass_if_local(skb, dev, vxlan, dst,
2695 2696
						    dst_port, ifindex, vni,
						    &rt->dst, rt->rt_flags);
2697
			if (err)
J
Jakub Kicinski 已提交
2698
				goto out_unlock;
2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709

			if (vxlan->cfg.df == VXLAN_DF_SET) {
				df = htons(IP_DF);
			} else if (vxlan->cfg.df == VXLAN_DF_INHERIT) {
				struct ethhdr *eth = eth_hdr(skb);

				if (ntohs(eth->h_proto) == ETH_P_IPV6 ||
				    (ntohs(eth->h_proto) == ETH_P_IP &&
				     old_iph->frag_off & htons(IP_DF)))
					df = htons(IP_DF);
			}
2710
		} else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) {
2711
			df = htons(IP_DF);
2712
		}
2713

P
pravin shelar 已提交
2714
		ndst = &rt->dst;
2715
		skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM);
X
Xin Long 已提交
2716

H
Hangbin Liu 已提交
2717
		tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb);
C
Cong Wang 已提交
2718
		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
P
pravin shelar 已提交
2719
		err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
2720
				      vni, md, flags, udp_sum);
2721
		if (err < 0)
P
pravin shelar 已提交
2722
			goto tx_error;
2723

2724
		udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, local_ip.sin.sin_addr.s_addr,
2725 2726
				    dst->sin.sin_addr.s_addr, tos, ttl, df,
				    src_port, dst_port, xnet, !udp_sum);
C
Cong Wang 已提交
2727 2728
#if IS_ENABLED(CONFIG_IPV6)
	} else {
2729
		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
C
Cong Wang 已提交
2730

2731 2732 2733
		if (!ifindex)
			ifindex = sock6->sock->sk->sk_bound_dev_if;

2734
		ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos,
2735
					label, &dst->sin6.sin6_addr,
2736
					&local_ip.sin6.sin6_addr,
2737
					dst_port, src_port,
2738
					dst_cache, info);
2739
		if (IS_ERR(ndst)) {
2740
			err = PTR_ERR(ndst);
P
pravin shelar 已提交
2741
			ndst = NULL;
2742
			goto tx_error;
C
Cong Wang 已提交
2743
		}
2744

2745 2746
		if (!info) {
			u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
2747

2748
			err = encap_bypass_if_local(skb, dev, vxlan, dst,
2749 2750
						    dst_port, ifindex, vni,
						    ndst, rt6i_flags);
2751
			if (err)
J
Jakub Kicinski 已提交
2752
				goto out_unlock;
2753
		}
2754

2755
		skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM);
X
Xin Long 已提交
2756

H
Hangbin Liu 已提交
2757
		tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb);
C
Cong Wang 已提交
2758
		ttl = ttl ? : ip6_dst_hoplimit(ndst);
2759 2760
		skb_scrub_packet(skb, xnet);
		err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr),
2761
				      vni, md, flags, udp_sum);
P
pravin shelar 已提交
2762 2763 2764
		if (err < 0)
			goto tx_error;

P
pravin shelar 已提交
2765
		udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
2766
				     &local_ip.sin6.sin6_addr,
2767
				     &dst->sin6.sin6_addr, tos, ttl,
2768
				     label, src_port, dst_port, !udp_sum);
C
Cong Wang 已提交
2769 2770
#endif
	}
J
Jakub Kicinski 已提交
2771 2772
out_unlock:
	rcu_read_unlock();
2773
	return;
S
stephen hemminger 已提交
2774 2775 2776

drop:
	dev->stats.tx_dropped++;
P
pravin shelar 已提交
2777 2778
	dev_kfree_skb(skb);
	return;
S
stephen hemminger 已提交
2779 2780

tx_error:
J
Jakub Kicinski 已提交
2781
	rcu_read_unlock();
2782 2783 2784 2785
	if (err == -ELOOP)
		dev->stats.collisions++;
	else if (err == -ENETUNREACH)
		dev->stats.tx_carrier_errors++;
P
pravin shelar 已提交
2786
	dst_release(ndst);
S
stephen hemminger 已提交
2787
	dev->stats.tx_errors++;
P
pravin shelar 已提交
2788
	kfree_skb(skb);
S
stephen hemminger 已提交
2789 2790
}

2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822
/* Transmit @skb through the nexthop object attached to FDB entry @f.
 *
 * A path is selected from the nexthop using the skb's flow hash and the
 * packet is sent to it with vxlan_xmit_one(). When the entry has no nexthop
 * or no path can be selected, the packet is freed and tx_dropped is
 * incremented.
 */
static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev,
			  struct vxlan_fdb *f, __be32 vni, bool did_rsc)
{
	struct vxlan_rdst selected;
	struct nexthop *nh;
	bool have_path = false;
	u32 flow_hash;

	memset(&selected, 0, sizeof(selected));
	flow_hash = skb_get_hash(skb);

	/* The nexthop is only dereferenced inside the RCU read section */
	rcu_read_lock();
	nh = rcu_dereference(f->nh);
	if (nh)
		have_path = vxlan_fdb_nh_path_select(nh, flow_hash, &selected);
	rcu_read_unlock();

	if (unlikely(!have_path)) {
		dev->stats.tx_dropped++;
		dev_kfree_skb(skb);
		return;
	}

	vxlan_xmit_one(skb, dev, vni, &selected, did_rsc);
}

2823 2824 2825 2826 2827 2828 2829 2830 2831
/* Transmit local packets over Vxlan
 *
 * Outer IP header inherits ECN and DF from inner header.
 * Outer UDP destination is the VXLAN assigned port.
 *           source port is based on hash of flow
 */
static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
2832
	struct vxlan_rdst *rdst, *fdst = NULL;
2833
	const struct ip_tunnel_info *info;
2834 2835
	bool did_rsc = false;
	struct vxlan_fdb *f;
2836
	struct ethhdr *eth;
2837
	__be32 vni = 0;
2838

2839
	info = skb_tunnel_info(skb);
2840

2841 2842
	skb_reset_mac_header(skb);

2843
	if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
2844 2845 2846 2847 2848 2849 2850 2851 2852 2853
		if (info && info->mode & IP_TUNNEL_INFO_BRIDGE &&
		    info->mode & IP_TUNNEL_INFO_TX) {
			vni = tunnel_id_to_key32(info->key.tun_id);
		} else {
			if (info && info->mode & IP_TUNNEL_INFO_TX)
				vxlan_xmit_one(skb, dev, vni, NULL, false);
			else
				kfree_skb(skb);
			return NETDEV_TX_OK;
		}
2854 2855
	}

2856
	if (vxlan->cfg.flags & VXLAN_F_PROXY) {
2857
		eth = eth_hdr(skb);
C
Cong Wang 已提交
2858
		if (ntohs(eth->h_proto) == ETH_P_ARP)
2859
			return arp_reduce(dev, skb, vni);
C
Cong Wang 已提交
2860
#if IS_ENABLED(CONFIG_IPV6)
2861 2862 2863 2864 2865 2866 2867 2868
		else if (ntohs(eth->h_proto) == ETH_P_IPV6 &&
			 pskb_may_pull(skb, sizeof(struct ipv6hdr) +
					    sizeof(struct nd_msg)) &&
			 ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
			struct nd_msg *m = (struct nd_msg *)(ipv6_hdr(skb) + 1);

			if (m->icmph.icmp6_code == 0 &&
			    m->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
2869
				return neigh_reduce(dev, skb, vni);
C
Cong Wang 已提交
2870 2871 2872
		}
#endif
	}
2873

2874
	eth = eth_hdr(skb);
2875
	f = vxlan_find_mac(vxlan, eth->h_dest, vni);
2876 2877
	did_rsc = false;

2878
	if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) &&
2879 2880
	    (ntohs(eth->h_proto) == ETH_P_IP ||
	     ntohs(eth->h_proto) == ETH_P_IPV6)) {
2881 2882
		did_rsc = route_shortcircuit(dev, skb);
		if (did_rsc)
2883
			f = vxlan_find_mac(vxlan, eth->h_dest, vni);
2884 2885
	}

2886
	if (f == NULL) {
2887
		f = vxlan_find_mac(vxlan, all_zeros_mac, vni);
2888
		if (f == NULL) {
2889
			if ((vxlan->cfg.flags & VXLAN_F_L2MISS) &&
2890 2891 2892 2893
			    !is_multicast_ether_addr(eth->h_dest))
				vxlan_fdb_miss(vxlan, eth->h_dest);

			dev->stats.tx_dropped++;
2894
			kfree_skb(skb);
2895 2896 2897
			return NETDEV_TX_OK;
		}
	}
2898

2899 2900 2901 2902 2903 2904
	if (rcu_access_pointer(f->nh)) {
		vxlan_xmit_nh(skb, dev, f,
			      (vni ? : vxlan->default_dst.remote_vni), did_rsc);
	} else {
		list_for_each_entry_rcu(rdst, &f->remotes, list) {
			struct sk_buff *skb1;
2905

2906 2907 2908 2909 2910 2911 2912
			if (!fdst) {
				fdst = rdst;
				continue;
			}
			skb1 = skb_clone(skb, GFP_ATOMIC);
			if (skb1)
				vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc);
2913
		}
2914 2915 2916 2917
		if (fdst)
			vxlan_xmit_one(skb, dev, vni, fdst, did_rsc);
		else
			kfree_skb(skb);
2918 2919
	}

2920
	return NETDEV_TX_OK;
2921 2922
}

S
stephen hemminger 已提交
2923
/* Walk the forwarding table and purge stale entries */
2924
static void vxlan_cleanup(struct timer_list *t)
S
stephen hemminger 已提交
2925
{
2926
	struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer);
S
stephen hemminger 已提交
2927 2928 2929 2930 2931 2932 2933 2934
	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
	unsigned int h;

	if (!netif_running(vxlan->dev))
		return;

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;
2935

2936
		spin_lock(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
2937 2938 2939 2940 2941
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
			unsigned long timeout;

2942
			if (f->state & (NUD_PERMANENT | NUD_NOARP))
S
stephen hemminger 已提交
2943 2944
				continue;

2945 2946 2947
			if (f->flags & NTF_EXT_LEARNED)
				continue;

2948
			timeout = f->used + vxlan->cfg.age_interval * HZ;
S
stephen hemminger 已提交
2949 2950 2951 2952 2953
			if (time_before_eq(timeout, jiffies)) {
				netdev_dbg(vxlan->dev,
					   "garbage collect %pM\n",
					   f->eth_addr);
				f->state = NUD_STALE;
2954
				vxlan_fdb_destroy(vxlan, f, true, true);
S
stephen hemminger 已提交
2955 2956 2957
			} else if (time_before(timeout, next_timer))
				next_timer = timeout;
		}
2958
		spin_unlock(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
2959 2960 2961 2962 2963
	}

	mod_timer(&vxlan->age_timer, next_timer);
}

2964 2965 2966 2967 2968
/* Unlink @vxlan from the per-socket VNI hash lists (IPv4 and, when
 * enabled, IPv6) under the per-netns sock_lock.
 */
static void vxlan_vs_del_dev(struct vxlan_dev *vxlan)
{
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);

	spin_lock(&vn->sock_lock);
	hlist_del_init_rcu(&vxlan->hlist4.hlist);
#if IS_ENABLED(CONFIG_IPV6)
	hlist_del_init_rcu(&vxlan->hlist6.hlist);
#endif
	spin_unlock(&vn->sock_lock);
}

J
Jiri Benc 已提交
2976 2977
static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan,
			     struct vxlan_dev_node *node)
2978
{
2979
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
2980
	__be32 vni = vxlan->default_dst.remote_vni;
2981

J
Jiri Benc 已提交
2982
	node->vxlan = vxlan;
2983
	spin_lock(&vn->sock_lock);
J
Jiri Benc 已提交
2984
	hlist_add_head_rcu(&node->hlist, vni_head(vs, vni));
2985
	spin_unlock(&vn->sock_lock);
2986 2987
}

S
stephen hemminger 已提交
2988 2989 2990
/* Setup stats when device is created */
static int vxlan_init(struct net_device *dev)
{
2991 2992 2993
	struct vxlan_dev *vxlan = netdev_priv(dev);
	int err;

2994
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
2995
	if (!dev->tstats)
S
stephen hemminger 已提交
2996 2997
		return -ENOMEM;

2998 2999 3000 3001 3002 3003
	err = gro_cells_init(&vxlan->gro_cells, dev);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

S
stephen hemminger 已提交
3004 3005 3006
	return 0;
}

3007
/* Destroy the default (all-zeros MAC) FDB entry for @vni, if one exists.
 * The lookup and the destroy both happen under the hash lock of the
 * bucket that the all-zeros MAC / @vni pair maps to.
 */
static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni)
{
	struct vxlan_fdb *f;
	u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, vni);

	spin_lock_bh(&vxlan->hash_lock[hash_index]);
	f = __vxlan_find_mac(vxlan, all_zeros_mac, vni);
	if (f)
		vxlan_fdb_destroy(vxlan, f, true, true);
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
}

3019 3020 3021 3022
static void vxlan_uninit(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

3023 3024
	gro_cells_destroy(&vxlan->gro_cells);

3025
	vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni);
3026

3027 3028 3029
	free_percpu(dev->tstats);
}

S
stephen hemminger 已提交
3030 3031 3032 3033
/* Start ageing timer and join group when device is brought up */
static int vxlan_open(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
3034
	int ret;
3035

3036 3037 3038
	ret = vxlan_sock_add(vxlan);
	if (ret < 0)
		return ret;
S
stephen hemminger 已提交
3039

3040
	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
3041
		ret = vxlan_igmp_join(vxlan);
3042 3043
		if (ret == -EADDRINUSE)
			ret = 0;
3044
		if (ret) {
3045
			vxlan_sock_release(vxlan);
3046 3047
			return ret;
		}
S
stephen hemminger 已提交
3048 3049
	}

3050
	if (vxlan->cfg.age_interval)
S
stephen hemminger 已提交
3051 3052
		mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);

3053
	return ret;
S
stephen hemminger 已提交
3054 3055 3056
}

/* Purge the forwarding table */
3057
static void vxlan_flush(struct vxlan_dev *vxlan, bool do_all)
S
stephen hemminger 已提交
3058
{
3059
	unsigned int h;
S
stephen hemminger 已提交
3060 3061 3062

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;
3063 3064

		spin_lock_bh(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
3065 3066 3067
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
3068 3069
			if (!do_all && (f->state & (NUD_PERMANENT | NUD_NOARP)))
				continue;
3070 3071
			/* the all_zeros_mac entry is deleted at vxlan_uninit */
			if (!is_zero_ether_addr(f->eth_addr))
3072
				vxlan_fdb_destroy(vxlan, f, true, true);
S
stephen hemminger 已提交
3073
		}
3074
		spin_unlock_bh(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
3075 3076 3077 3078 3079 3080 3081
	}
}

/* Cleanup timer and forwarding table on shutdown */
static int vxlan_stop(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
N
Nicolas Dichtel 已提交
3082
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
3083
	int ret = 0;
S
stephen hemminger 已提交
3084

3085
	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
3086
	    !vxlan_group_used(vn, vxlan))
3087
		ret = vxlan_igmp_leave(vxlan);
S
stephen hemminger 已提交
3088 3089 3090

	del_timer_sync(&vxlan->age_timer);

3091
	vxlan_flush(vxlan, false);
3092
	vxlan_sock_release(vxlan);
S
stephen hemminger 已提交
3093

3094
	return ret;
S
stephen hemminger 已提交
3095 3096 3097 3098 3099 3100 3101
}

/* Stub, nothing needs to be done.
 * Installed as .ndo_set_rx_mode in vxlan_netdev_ether_ops.
 */
static void vxlan_set_multicast_list(struct net_device *dev)
{
}

3102
static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
3103
{
3104 3105 3106 3107
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	struct net_device *lowerdev = __dev_get_by_index(vxlan->net,
							 dst->remote_ifindex);
3108
	bool use_ipv6 = !!(vxlan->cfg.flags & VXLAN_F_IPV6);
3109

3110 3111 3112 3113 3114 3115 3116
	/* This check is different than dev->max_mtu, because it looks at
	 * the lowerdev->mtu, rather than the static dev->max_mtu
	 */
	if (lowerdev) {
		int max_mtu = lowerdev->mtu -
			      (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
		if (new_mtu > max_mtu)
D
David Wragg 已提交
3117 3118 3119
			return -EINVAL;
	}

3120 3121 3122 3123
	dev->mtu = new_mtu;
	return 0;
}

3124 3125 3126 3127 3128 3129 3130 3131 3132 3133
static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	__be16 sport, dport;

	sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
				  vxlan->cfg.port_max, true);
	dport = info->key.tp_dst ? : vxlan->cfg.dst_port;

3134
	if (ip_tunnel_info_af(info) == AF_INET) {
3135
		struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
3136 3137
		struct rtable *rt;

3138
		rt = vxlan_get_route(vxlan, dev, sock4, skb, 0, info->key.tos,
3139
				     info->key.u.ipv4.dst,
3140 3141
				     &info->key.u.ipv4.src, dport, sport,
				     &info->dst_cache, info);
3142 3143 3144
		if (IS_ERR(rt))
			return PTR_ERR(rt);
		ip_rt_put(rt);
3145 3146
	} else {
#if IS_ENABLED(CONFIG_IPV6)
3147
		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
3148 3149
		struct dst_entry *ndst;

3150
		ndst = vxlan6_get_route(vxlan, dev, sock6, skb, 0, info->key.tos,
3151
					info->key.label, &info->key.u.ipv6.dst,
3152 3153
					&info->key.u.ipv6.src, dport, sport,
					&info->dst_cache, info);
3154 3155 3156 3157 3158 3159 3160
		if (IS_ERR(ndst))
			return PTR_ERR(ndst);
		dst_release(ndst);
#else /* !CONFIG_IPV6 */
		return -EPFNOSUPPORT;
#endif
	}
3161 3162
	info->key.tp_src = sport;
	info->key.tp_dst = dport;
3163
	return 0;
3164 3165
}

3166
static const struct net_device_ops vxlan_netdev_ether_ops = {
S
stephen hemminger 已提交
3167
	.ndo_init		= vxlan_init,
3168
	.ndo_uninit		= vxlan_uninit,
S
stephen hemminger 已提交
3169 3170 3171
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,
	.ndo_start_xmit		= vxlan_xmit,
3172
	.ndo_get_stats64	= ip_tunnel_get_stats64,
S
stephen hemminger 已提交
3173
	.ndo_set_rx_mode	= vxlan_set_multicast_list,
3174
	.ndo_change_mtu		= vxlan_change_mtu,
S
stephen hemminger 已提交
3175 3176 3177 3178 3179
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_fdb_add		= vxlan_fdb_add,
	.ndo_fdb_del		= vxlan_fdb_delete,
	.ndo_fdb_dump		= vxlan_fdb_dump,
R
Roopa Prabhu 已提交
3180
	.ndo_fdb_get		= vxlan_fdb_get,
3181
	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,
3182
	.ndo_change_proto_down  = dev_change_proto_down_generic,
S
stephen hemminger 已提交
3183 3184
};

J
Jiri Benc 已提交
3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195
/* net_device_ops for raw (non-Ethernet) VXLAN devices; installed by
 * vxlan_raw_setup().
 */
static const struct net_device_ops vxlan_netdev_raw_ops = {
	/* datapath */
	.ndo_start_xmit		= vxlan_xmit,
	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,
	.ndo_get_stats64	= ip_tunnel_get_stats64,

	/* lifecycle */
	.ndo_init		= vxlan_init,
	.ndo_uninit		= vxlan_uninit,
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,

	/* configuration */
	.ndo_change_mtu		= vxlan_change_mtu,
};

S
stephen hemminger 已提交
3196 3197 3198 3199 3200
/* Info for udev, that this is a virtual tunnel endpoint.
 * Attached to each device by SET_NETDEV_DEVTYPE() in vxlan_setup().
 */
static struct device_type vxlan_type = {
	.name = "vxlan",
};

3201
/* Calls the ndo_udp_tunnel_add of the caller in order to
J
Joseph Gasparakis 已提交
3202
 * supply the listening VXLAN udp ports. Callers are expected
3203
 * to implement the ndo_udp_tunnel_add.
3204
 */
3205
static void vxlan_offload_rx_ports(struct net_device *dev, bool push)
3206 3207 3208 3209
{
	struct vxlan_sock *vs;
	struct net *net = dev_net(dev);
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
J
Joseph Gasparakis 已提交
3210
	unsigned int i;
3211 3212 3213

	spin_lock(&vn->sock_lock);
	for (i = 0; i < PORT_HASH_SIZE; ++i) {
3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226
		hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) {
			unsigned short type;

			if (vs->flags & VXLAN_F_GPE)
				type = UDP_TUNNEL_TYPE_VXLAN_GPE;
			else
				type = UDP_TUNNEL_TYPE_VXLAN;

			if (push)
				udp_tunnel_push_rx_port(dev, vs->sock, type);
			else
				udp_tunnel_drop_rx_port(dev, vs->sock, type);
		}
3227 3228 3229 3230
	}
	spin_unlock(&vn->sock_lock);
}

S
stephen hemminger 已提交
3231 3232 3233 3234
/* Initialize the device structure. */
static void vxlan_setup(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
3235
	unsigned int h;
S
stephen hemminger 已提交
3236

3237 3238 3239
	eth_hw_addr_random(dev);
	ether_setup(dev);

3240
	dev->needs_free_netdev = true;
S
stephen hemminger 已提交
3241 3242 3243
	SET_NETDEV_DEVTYPE(dev, &vxlan_type);

	dev->features	|= NETIF_F_LLTX;
3244
	dev->features	|= NETIF_F_SG | NETIF_F_HW_CSUM;
3245
	dev->features   |= NETIF_F_RXCSUM;
3246
	dev->features   |= NETIF_F_GSO_SOFTWARE;
3247

3248
	dev->vlan_features = dev->features;
3249
	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
3250
	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
3251
	netif_keep_dst(dev);
3252
	dev->priv_flags |= IFF_NO_QUEUE;
S
stephen hemminger 已提交
3253

3254 3255 3256 3257
	/* MTU range: 68 - 65535 */
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = ETH_MAX_MTU;

3258
	INIT_LIST_HEAD(&vxlan->next);
S
stephen hemminger 已提交
3259

3260
	timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE);
S
stephen hemminger 已提交
3261 3262 3263

	vxlan->dev = dev;

3264 3265
	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		spin_lock_init(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
3266
		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
3267
	}
S
stephen hemminger 已提交
3268 3269
}

3270 3271 3272 3273 3274 3275 3276
/* Finish setup of an Ethernet-type VXLAN device: install the Ethernet
 * netdev ops, clear IFF_TX_SKB_SHARING and set IFF_LIVE_ADDR_CHANGE.
 */
static void vxlan_ether_setup(struct net_device *dev)
{
	dev->netdev_ops = &vxlan_netdev_ether_ops;
	dev->priv_flags = (dev->priv_flags & ~IFF_TX_SKB_SHARING) |
			  IFF_LIVE_ADDR_CHANGE;
}

J
Jiri Benc 已提交
3277 3278
static void vxlan_raw_setup(struct net_device *dev)
{
3279
	dev->header_ops = NULL;
J
Jiri Benc 已提交
3280 3281 3282 3283 3284 3285 3286
	dev->type = ARPHRD_NONE;
	dev->hard_header_len = 0;
	dev->addr_len = 0;
	dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
	dev->netdev_ops = &vxlan_netdev_raw_ops;
}

S
stephen hemminger 已提交
3287 3288
static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
3289
	[IFLA_VXLAN_GROUP]	= { .len = sizeof_field(struct iphdr, daddr) },
C
Cong Wang 已提交
3290
	[IFLA_VXLAN_GROUP6]	= { .len = sizeof(struct in6_addr) },
S
stephen hemminger 已提交
3291
	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
3292
	[IFLA_VXLAN_LOCAL]	= { .len = sizeof_field(struct iphdr, saddr) },
C
Cong Wang 已提交
3293
	[IFLA_VXLAN_LOCAL6]	= { .len = sizeof(struct in6_addr) },
S
stephen hemminger 已提交
3294 3295
	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
3296
	[IFLA_VXLAN_LABEL]	= { .type = NLA_U32 },
S
stephen hemminger 已提交
3297 3298 3299
	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
	[IFLA_VXLAN_AGEING]	= { .type = NLA_U32 },
	[IFLA_VXLAN_LIMIT]	= { .type = NLA_U32 },
3300
	[IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
D
David Stevens 已提交
3301 3302 3303 3304
	[IFLA_VXLAN_PROXY]	= { .type = NLA_U8 },
	[IFLA_VXLAN_RSC]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L2MISS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L3MISS]	= { .type = NLA_U8 },
3305
	[IFLA_VXLAN_COLLECT_METADATA]	= { .type = NLA_U8 },
3306
	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
3307 3308 3309
	[IFLA_VXLAN_UDP_CSUM]	= { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]	= { .type = NLA_U8 },
T
Tom Herbert 已提交
3310 3311
	[IFLA_VXLAN_REMCSUM_TX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_REMCSUM_RX]	= { .type = NLA_U8 },
T
Thomas Graf 已提交
3312
	[IFLA_VXLAN_GBP]	= { .type = NLA_FLAG, },
J
Jiri Benc 已提交
3313
	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
3314
	[IFLA_VXLAN_REMCSUM_NOPARTIAL]	= { .type = NLA_FLAG },
H
Hangbin Liu 已提交
3315
	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
3316
	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
S
stephen hemminger 已提交
3317 3318
};

3319 3320
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
			  struct netlink_ext_ack *extack)
S
stephen hemminger 已提交
3321 3322 3323
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
3324 3325
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
					    "Provided link layer address is not Ethernet");
S
stephen hemminger 已提交
3326 3327 3328 3329
			return -EINVAL;
		}

		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
3330 3331
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
					    "Provided Ethernet address is not unicast");
S
stephen hemminger 已提交
3332 3333 3334 3335
			return -EADDRNOTAVAIL;
		}
	}

3336
	if (tb[IFLA_MTU]) {
3337
		u32 mtu = nla_get_u32(tb[IFLA_MTU]);
3338

3339 3340 3341
		if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
					    "MTU must be between 68 and 65535");
3342
			return -EINVAL;
3343
		}
3344 3345
	}

3346 3347 3348
	if (!data) {
		NL_SET_ERR_MSG(extack,
			       "Required attributes not provided to perform the operation");
S
stephen hemminger 已提交
3349
		return -EINVAL;
3350
	}
S
stephen hemminger 已提交
3351 3352

	if (data[IFLA_VXLAN_ID]) {
3353 3354
		u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);

3355
		if (id >= VXLAN_N_VID) {
3356
			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_ID],
3357
					    "VXLAN ID must be lower than 16777216");
S
stephen hemminger 已提交
3358
			return -ERANGE;
3359
		}
S
stephen hemminger 已提交
3360 3361
	}

3362 3363 3364 3365 3366
	if (data[IFLA_VXLAN_PORT_RANGE]) {
		const struct ifla_vxlan_port_range *p
			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);

		if (ntohs(p->high) < ntohs(p->low)) {
3367
			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_PORT_RANGE],
3368
					    "Invalid source port range");
3369 3370 3371 3372
			return -EINVAL;
		}
	}

3373 3374 3375 3376
	if (data[IFLA_VXLAN_DF]) {
		enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]);

		if (df < 0 || df > VXLAN_DF_MAX) {
3377
			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_DF],
3378 3379 3380 3381 3382
					    "Invalid DF attribute");
			return -EINVAL;
		}
	}

S
stephen hemminger 已提交
3383 3384 3385
	return 0;
}

Y
Yan Burman 已提交
3386 3387 3388 3389 3390 3391 3392
/* ethtool get_drvinfo: report the driver name "vxlan" and VXLAN_VERSION.
 * strscpy() replaces the deprecated strlcpy(); both bound the copy by the
 * destination size and NUL-terminate, and the return value is unused.
 */
static void vxlan_get_drvinfo(struct net_device *netdev,
			      struct ethtool_drvinfo *drvinfo)
{
	strscpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
	strscpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
}

3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411
/* ethtool get_link_ksettings: report the link settings of the lower
 * device named by the default destination's ifindex.  When no such
 * device is found, speed and duplex are reported as unknown and the port
 * as PORT_OTHER, and 0 is returned.
 */
static int vxlan_get_link_ksettings(struct net_device *dev,
				    struct ethtool_link_ksettings *cmd)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct net_device *lower;

	lower = __dev_get_by_index(vxlan->net,
				   vxlan->default_dst.remote_ifindex);
	if (lower)
		return __ethtool_get_link_ksettings(lower, cmd);

	cmd->base.speed = SPEED_UNKNOWN;
	cmd->base.duplex = DUPLEX_UNKNOWN;
	cmd->base.port = PORT_OTHER;

	return 0;
}

Y
Yan Burman 已提交
3412
static const struct ethtool_ops vxlan_ethtool_ops = {
3413 3414 3415
	.get_drvinfo		= vxlan_get_drvinfo,
	.get_link		= ethtool_op_get_link,
	.get_link_ksettings	= vxlan_get_link_ksettings,
Y
Yan Burman 已提交
3416 3417
};

T
Tom Herbert 已提交
3418
static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
3419
					__be16 port, u32 flags, int ifindex)
3420
{
C
Cong Wang 已提交
3421
	struct socket *sock;
T
Tom Herbert 已提交
3422 3423
	struct udp_port_cfg udp_conf;
	int err;
C
Cong Wang 已提交
3424

T
Tom Herbert 已提交
3425
	memset(&udp_conf, 0, sizeof(udp_conf));
C
Cong Wang 已提交
3426

T
Tom Herbert 已提交
3427 3428 3429
	if (ipv6) {
		udp_conf.family = AF_INET6;
		udp_conf.use_udp6_rx_checksums =
3430
		    !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
3431
		udp_conf.ipv6_v6only = 1;
T
Tom Herbert 已提交
3432 3433
	} else {
		udp_conf.family = AF_INET;
C
Cong Wang 已提交
3434 3435
	}

T
Tom Herbert 已提交
3436
	udp_conf.local_udp_port = port;
3437
	udp_conf.bind_ifindex = ifindex;
3438

T
Tom Herbert 已提交
3439 3440 3441 3442
	/* Open UDP socket */
	err = udp_sock_create(net, &udp_conf, &sock);
	if (err < 0)
		return ERR_PTR(err);
C
Cong Wang 已提交
3443

Z
Zhi Yong Wu 已提交
3444
	return sock;
C
Cong Wang 已提交
3445 3446 3447
}

/* Create new listen socket if needed */
3448
static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
3449 3450
					      __be16 port, u32 flags,
					      int ifindex)
C
Cong Wang 已提交
3451 3452 3453 3454 3455
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_sock *vs;
	struct socket *sock;
	unsigned int h;
3456
	struct udp_tunnel_sock_cfg tunnel_cfg;
C
Cong Wang 已提交
3457

3458
	vs = kzalloc(sizeof(*vs), GFP_KERNEL);
C
Cong Wang 已提交
3459 3460 3461 3462 3463 3464
	if (!vs)
		return ERR_PTR(-ENOMEM);

	for (h = 0; h < VNI_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vs->vni_list[h]);

3465
	sock = vxlan_create_sock(net, ipv6, port, flags, ifindex);
Z
Zhi Yong Wu 已提交
3466
	if (IS_ERR(sock)) {
3467
		kfree(vs);
3468
		return ERR_CAST(sock);
3469
	}
C
Cong Wang 已提交
3470 3471

	vs->sock = sock;
3472
	refcount_set(&vs->refcnt, 1);
3473
	vs->flags = (flags & VXLAN_F_RCV_FLAGS);
3474

3475 3476
	spin_lock(&vn->sock_lock);
	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
3477
	udp_tunnel_notify_add_rx_port(sock,
3478 3479
				      (vs->flags & VXLAN_F_GPE) ?
				      UDP_TUNNEL_TYPE_VXLAN_GPE :
3480
				      UDP_TUNNEL_TYPE_VXLAN);
3481
	spin_unlock(&vn->sock_lock);
3482 3483

	/* Mark socket as an encapsulation socket. */
3484
	memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
3485 3486
	tunnel_cfg.sk_user_data = vs;
	tunnel_cfg.encap_type = 1;
3487
	tunnel_cfg.encap_rcv = vxlan_rcv;
S
Stefano Brivio 已提交
3488
	tunnel_cfg.encap_err_lookup = vxlan_err_lookup;
3489
	tunnel_cfg.encap_destroy = NULL;
3490 3491
	tunnel_cfg.gro_receive = vxlan_gro_receive;
	tunnel_cfg.gro_complete = vxlan_gro_complete;
3492 3493

	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
C
Cong Wang 已提交
3494

3495 3496 3497
	return vs;
}

3498
/* Attach @vxlan to a socket of one address family.
 *
 * Unless cfg.no_share is set, an existing matching socket is looked up
 * under sock_lock and a reference taken; if that socket's refcount has
 * already dropped to zero, -EBUSY is returned.  When nothing was found
 * (or sharing is disabled) a new socket is created.  The socket is then
 * published in vn6_sock or vn4_sock and the device is hashed onto it.
 *
 * Returns 0 on success, -EBUSY, or the socket creation error.
 */
static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
{
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	struct vxlan_sock *vs = NULL;
	struct vxlan_dev_node *node;
	int l3mdev_index = 0;

	if (vxlan->cfg.remote_ifindex)
		l3mdev_index = l3mdev_master_upper_ifindex_by_index(
			vxlan->net, vxlan->cfg.remote_ifindex);

	if (!vxlan->cfg.no_share) {
		spin_lock(&vn->sock_lock);
		vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
				     vxlan->cfg.dst_port, vxlan->cfg.flags,
				     l3mdev_index);
		if (vs && !refcount_inc_not_zero(&vs->refcnt)) {
			spin_unlock(&vn->sock_lock);
			return -EBUSY;
		}
		spin_unlock(&vn->sock_lock);
	}
	if (!vs)
		vs = vxlan_socket_create(vxlan->net, ipv6,
					 vxlan->cfg.dst_port, vxlan->cfg.flags,
					 l3mdev_index);
	if (IS_ERR(vs))
		return PTR_ERR(vs);
#if IS_ENABLED(CONFIG_IPV6)
	if (ipv6) {
		rcu_assign_pointer(vxlan->vn6_sock, vs);
		node = &vxlan->hlist6;
	} else
#endif
	{
		rcu_assign_pointer(vxlan->vn4_sock, vs);
		node = &vxlan->hlist4;
	}
	vxlan_vs_add_dev(vs, vxlan, node);
	return 0;
}

3540 3541
static int vxlan_sock_add(struct vxlan_dev *vxlan)
{
3542 3543
	bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
	bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata;
3544
	bool ipv4 = !ipv6 || metadata;
3545 3546
	int ret = 0;

3547
	RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
3548
#if IS_ENABLED(CONFIG_IPV6)
3549
	RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
3550
	if (ipv6) {
3551
		ret = __vxlan_sock_add(vxlan, true);
3552 3553 3554
		if (ret < 0 && ret != -EAFNOSUPPORT)
			ipv4 = false;
	}
3555
#endif
3556
	if (ipv4)
3557 3558 3559 3560 3561 3562
		ret = __vxlan_sock_add(vxlan, false);
	if (ret < 0)
		vxlan_sock_release(vxlan);
	return ret;
}

3563 3564
static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf,
				 struct net_device **lower,
3565 3566
				 struct vxlan_dev *old,
				 struct netlink_ext_ack *extack)
S
stephen hemminger 已提交
3567
{
3568
	struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
3569
	struct vxlan_dev *tmp;
C
Cong Wang 已提交
3570
	bool use_ipv6 = false;
S
stephen hemminger 已提交
3571

3572 3573 3574 3575 3576 3577 3578 3579
	if (conf->flags & VXLAN_F_GPE) {
		/* For now, allow GPE only together with
		 * COLLECT_METADATA. This can be relaxed later; in such
		 * case, the other side of the PtP link will have to be
		 * provided.
		 */
		if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
		    !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
3580 3581
			NL_SET_ERR_MSG(extack,
				       "VXLAN GPE does not support this combination of attributes");
3582
			return -EINVAL;
3583
		}
J
Jiri Benc 已提交
3584
	}
3585

3586 3587
	if (!conf->remote_ip.sa.sa_family && !conf->saddr.sa.sa_family) {
		/* Unless IPv6 is explicitly requested, assume IPv4 */
3588
		conf->remote_ip.sa.sa_family = AF_INET;
3589 3590 3591 3592 3593 3594 3595
		conf->saddr.sa.sa_family = AF_INET;
	} else if (!conf->remote_ip.sa.sa_family) {
		conf->remote_ip.sa.sa_family = conf->saddr.sa.sa_family;
	} else if (!conf->saddr.sa.sa_family) {
		conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family;
	}

3596 3597 3598
	if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) {
		NL_SET_ERR_MSG(extack,
			       "Local and remote address must be from the same family");
3599
		return -EINVAL;
3600
	}
C
Cong Wang 已提交
3601

3602 3603
	if (vxlan_addr_multicast(&conf->saddr)) {
		NL_SET_ERR_MSG(extack, "Local address cannot be multicast");
3604
		return -EINVAL;
3605
	}
3606

3607
	if (conf->saddr.sa.sa_family == AF_INET6) {
3608 3609 3610
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG(extack,
				       "IPv6 support not enabled in the kernel");
3611
			return -EPFNOSUPPORT;
3612
		}
C
Cong Wang 已提交
3613
		use_ipv6 = true;
3614
		conf->flags |= VXLAN_F_IPV6;
3615 3616 3617 3618 3619 3620 3621 3622 3623

		if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) {
			int local_type =
				ipv6_addr_type(&conf->saddr.sin6.sin6_addr);
			int remote_type =
				ipv6_addr_type(&conf->remote_ip.sin6.sin6_addr);

			if (local_type & IPV6_ADDR_LINKLOCAL) {
				if (!(remote_type & IPV6_ADDR_LINKLOCAL) &&
3624 3625 3626
				    (remote_type != IPV6_ADDR_ANY)) {
					NL_SET_ERR_MSG(extack,
						       "Invalid combination of local and remote address scopes");
3627
					return -EINVAL;
3628
				}
3629 3630 3631 3632

				conf->flags |= VXLAN_F_IPV6_LINKLOCAL;
			} else {
				if (remote_type ==
3633 3634 3635
				    (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL)) {
					NL_SET_ERR_MSG(extack,
						       "Invalid combination of local and remote address scopes");
3636
					return -EINVAL;
3637
				}
3638 3639 3640 3641

				conf->flags &= ~VXLAN_F_IPV6_LINKLOCAL;
			}
		}
3642
	}
S
stephen hemminger 已提交
3643

3644 3645 3646
	if (conf->label && !use_ipv6) {
		NL_SET_ERR_MSG(extack,
			       "Label attribute only applies to IPv6 VXLAN devices");
3647
		return -EINVAL;
3648
	}
3649

3650 3651
	if (conf->remote_ifindex) {
		struct net_device *lowerdev;
3652

3653
		lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
3654 3655 3656
		if (!lowerdev) {
			NL_SET_ERR_MSG(extack,
				       "Invalid local interface, device not found");
3657
			return -ENODEV;
3658
		}
S
stephen hemminger 已提交
3659

C
Cong Wang 已提交
3660 3661 3662
#if IS_ENABLED(CONFIG_IPV6)
		if (use_ipv6) {
			struct inet6_dev *idev = __in6_dev_get(lowerdev);
3663 3664 3665
			if (idev && idev->cnf.disable_ipv6) {
				NL_SET_ERR_MSG(extack,
					       "IPv6 support disabled by administrator");
C
Cong Wang 已提交
3666
				return -EPERM;
3667
			}
C
Cong Wang 已提交
3668 3669 3670
		}
#endif

3671 3672
		*lower = lowerdev;
	} else {
3673 3674 3675 3676
		if (vxlan_addr_multicast(&conf->remote_ip)) {
			NL_SET_ERR_MSG(extack,
				       "Local interface required for multicast remote destination");

3677
			return -EINVAL;
3678
		}
3679

3680
#if IS_ENABLED(CONFIG_IPV6)
3681 3682 3683
		if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) {
			NL_SET_ERR_MSG(extack,
				       "Local interface required for link-local local/remote addresses");
3684
			return -EINVAL;
3685
		}
3686 3687
#endif

3688
		*lower = NULL;
J
Jiri Benc 已提交
3689
	}
S
stephen hemminger 已提交
3690

3691 3692 3693 3694 3695
	if (!conf->dst_port) {
		if (conf->flags & VXLAN_F_GPE)
			conf->dst_port = htons(4790); /* IANA VXLAN-GPE port */
		else
			conf->dst_port = htons(vxlan_port);
3696 3697
	}

3698 3699
	if (!conf->age_interval)
		conf->age_interval = FDB_AGE_DEFAULT;
3700

3701 3702 3703
	list_for_each_entry(tmp, &vn->vxlan_list, next) {
		if (tmp == old)
			continue;
3704

3705 3706 3707 3708 3709
		if (tmp->cfg.vni != conf->vni)
			continue;
		if (tmp->cfg.dst_port != conf->dst_port)
			continue;
		if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) !=
3710
		    (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)))
3711 3712 3713 3714 3715 3716
			continue;

		if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) &&
		    tmp->cfg.remote_ifindex != conf->remote_ifindex)
			continue;

3717 3718
		NL_SET_ERR_MSG(extack,
			       "A VXLAN device with the specified VNI already exists");
3719
		return -EEXIST;
3720
	}
3721

3722 3723 3724 3725 3726
	return 0;
}

/* Apply an already validated configuration to the device.
 *
 * On creation (!@changelink) the device is set up as raw (GPE) or
 * Ethernet, takes conf->mtu if given, and records @src_net.  In all cases
 * the default destination (VNI, remote address and, with a lower device,
 * its ifindex) is updated, GSO limits are inherited from @lowerdev, the
 * MTU is clamped to what fits over @lowerdev (never below ETH_MIN_MTU),
 * needed_headroom is recomputed and @conf is copied into vxlan->cfg.
 */
static void vxlan_config_apply(struct net_device *dev,
			       struct vxlan_config *conf,
			       struct net_device *lowerdev,
			       struct net *src_net,
			       bool changelink)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	unsigned short needed_headroom = ETH_HLEN;
	bool use_ipv6 = !!(conf->flags & VXLAN_F_IPV6);
	int max_mtu = ETH_MAX_MTU;

	if (!changelink) {
		if (conf->flags & VXLAN_F_GPE)
			vxlan_raw_setup(dev);
		else
			vxlan_ether_setup(dev);

		if (conf->mtu)
			dev->mtu = conf->mtu;

		vxlan->net = src_net;
	}

	dst->remote_vni = conf->vni;

	memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip));

	if (lowerdev) {
		dst->remote_ifindex = conf->remote_ifindex;

		dev->gso_max_size = lowerdev->gso_max_size;
		dev->gso_max_segs = lowerdev->gso_max_segs;

		needed_headroom = lowerdev->hard_header_len;

		max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM :
					   VXLAN_HEADROOM);
		if (max_mtu < ETH_MIN_MTU)
			max_mtu = ETH_MIN_MTU;

		/* no explicit MTU requested at creation: use the largest */
		if (!changelink && !conf->mtu)
			dev->mtu = max_mtu;
	}

	if (dev->mtu > max_mtu)
		dev->mtu = max_mtu;

	/* metadata devices reserve the larger IPv6 headroom as well */
	if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA)
		needed_headroom += VXLAN6_HEADROOM;
	else
		needed_headroom += VXLAN_HEADROOM;
	dev->needed_headroom = needed_headroom;

	memcpy(&vxlan->cfg, conf, sizeof(*conf));
}
3781

3782
/* Validate @conf and, only if validation succeeds, apply it to @dev.
 * The device itself is passed as the "old" device so that it does not
 * collide with itself in the duplicate check.  Returns 0 or the
 * validation error; nothing is applied on error.
 */
static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
			       struct vxlan_config *conf, bool changelink,
			       struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct net_device *lowerdev;
	int ret;

	ret = vxlan_config_validate(src_net, conf, &lowerdev, vxlan, extack);
	if (ret)
		return ret;

	vxlan_config_apply(dev, conf, lowerdev, src_net, changelink);

	return 0;
}

N
Nicolas Dichtel 已提交
3799
static int __vxlan_dev_create(struct net *net, struct net_device *dev,
3800 3801
			      struct vxlan_config *conf,
			      struct netlink_ext_ack *extack)
N
Nicolas Dichtel 已提交
3802 3803 3804
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan = netdev_priv(dev);
3805
	struct net_device *remote_dev = NULL;
3806
	struct vxlan_fdb *f = NULL;
3807
	bool unregister = false;
3808
	struct vxlan_rdst *dst;
N
Nicolas Dichtel 已提交
3809 3810
	int err;

3811
	dst = &vxlan->default_dst;
3812
	err = vxlan_dev_configure(net, dev, conf, false, extack);
N
Nicolas Dichtel 已提交
3813 3814 3815 3816 3817 3818
	if (err)
		return err;

	dev->ethtool_ops = &vxlan_ethtool_ops;

	/* create an fdb entry for a valid default destination */
3819
	if (!vxlan_addr_any(&dst->remote_ip)) {
3820
		err = vxlan_fdb_create(vxlan, all_zeros_mac,
3821
				       &dst->remote_ip,
N
Nicolas Dichtel 已提交
3822 3823
				       NUD_REACHABLE | NUD_PERMANENT,
				       vxlan->cfg.dst_port,
3824 3825 3826
				       dst->remote_vni,
				       dst->remote_vni,
				       dst->remote_ifindex,
3827
				       NTF_SELF, 0, &f, extack);
N
Nicolas Dichtel 已提交
3828 3829 3830 3831 3832
		if (err)
			return err;
	}

	err = register_netdevice(dev);
3833 3834
	if (err)
		goto errout;
3835
	unregister = true;
3836

3837 3838 3839 3840 3841 3842 3843 3844 3845 3846
	if (dst->remote_ifindex) {
		remote_dev = __dev_get_by_index(net, dst->remote_ifindex);
		if (!remote_dev)
			goto errout;

		err = netdev_upper_dev_link(remote_dev, dev, extack);
		if (err)
			goto errout;
	}

3847
	err = rtnl_configure_link(dev, NULL);
3848
	if (err)
3849
		goto unlink;
N
Nicolas Dichtel 已提交
3850

3851
	if (f) {
3852
		vxlan_fdb_insert(vxlan, all_zeros_mac, dst->remote_vni, f);
3853 3854

		/* notify default fdb entry */
3855
		err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f),
3856
				       RTM_NEWNEIGH, true, extack);
3857 3858
		if (err) {
			vxlan_fdb_destroy(vxlan, f, false, false);
3859 3860
			if (remote_dev)
				netdev_upper_dev_unlink(remote_dev, dev);
3861 3862
			goto unregister;
		}
3863
	}
3864

N
Nicolas Dichtel 已提交
3865
	list_add(&vxlan->next, &vn->vxlan_list);
3866 3867
	if (remote_dev)
		dst->remote_dev = remote_dev;
N
Nicolas Dichtel 已提交
3868
	return 0;
3869 3870 3871
unlink:
	if (remote_dev)
		netdev_upper_dev_unlink(remote_dev, dev);
3872
errout:
3873 3874 3875 3876
	/* unregister_netdevice() destroys the default FDB entry with deletion
	 * notification. But the addition notification was not sent yet, so
	 * destroy the entry by hand here.
	 */
3877
	if (f)
3878 3879
		__vxlan_fdb_free(f);
unregister:
3880 3881
	if (unregister)
		unregister_netdevice(dev);
3882
	return err;
N
Nicolas Dichtel 已提交
3883 3884
}

3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912
/* Set/clear flags based on attribute
 *
 * Updates the @mask bits of conf->flags from netlink attribute @attrtype.
 * An absent attribute leaves @conf untouched.  Attributes whose policy
 * type is NLA_FLAG set the bits by their mere presence; any other
 * attribute is read as a u8 that sets the bits when non-zero and clears
 * them when zero.
 *
 * Returns 0, or -EOPNOTSUPP (after reporting through
 * vxlan_flag_attr_error()) when the attribute is present on a changelink
 * request for which @changelink_supported is false.
 */
static int vxlan_nl2flag(struct vxlan_config *conf, struct nlattr *tb[],
			  int attrtype, unsigned long mask, bool changelink,
			  bool changelink_supported,
			  struct netlink_ext_ack *extack)
{
	struct nlattr *attr = tb[attrtype];

	if (!attr)
		return 0;

	if (changelink && !changelink_supported) {
		vxlan_flag_attr_error(attrtype, extack);
		return -EOPNOTSUPP;
	}

	/* presence-only flag, or a u8 boolean that is non-zero */
	if (vxlan_policy[attrtype].type == NLA_FLAG || nla_get_u8(attr))
		conf->flags |= mask;
	else
		conf->flags &= ~mask;

	return 0;
}

R
Roopa Prabhu 已提交
3913 3914
static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
			 struct net_device *dev, struct vxlan_config *conf,
3915
			 bool changelink, struct netlink_ext_ack *extack)
3916
{
R
Roopa Prabhu 已提交
3917
	struct vxlan_dev *vxlan = netdev_priv(dev);
3918
	int err = 0;
3919

R
Roopa Prabhu 已提交
3920
	memset(conf, 0, sizeof(*conf));
3921

R
Roopa Prabhu 已提交
3922 3923 3924 3925 3926 3927 3928
	/* if changelink operation, start with old existing cfg */
	if (changelink)
		memcpy(conf, &vxlan->cfg, sizeof(*conf));

	if (data[IFLA_VXLAN_ID]) {
		__be32 vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));

3929 3930
		if (changelink && (vni != conf->vni)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID], "Cannot change VNI");
R
Roopa Prabhu 已提交
3931
			return -EOPNOTSUPP;
3932
		}
R
Roopa Prabhu 已提交
3933 3934
		conf->vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));
	}
3935 3936

	if (data[IFLA_VXLAN_GROUP]) {
3937 3938
		if (changelink && (conf->remote_ip.sa.sa_family != AF_INET)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP], "New group address family does not match old group");
3939
			return -EOPNOTSUPP;
3940
		}
3941

R
Roopa Prabhu 已提交
3942
		conf->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
3943
		conf->remote_ip.sa.sa_family = AF_INET;
3944
	} else if (data[IFLA_VXLAN_GROUP6]) {
3945 3946
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "IPv6 support not enabled in the kernel");
3947
			return -EPFNOSUPPORT;
3948
		}
3949

3950 3951
		if (changelink && (conf->remote_ip.sa.sa_family != AF_INET6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "New group address family does not match old group");
3952
			return -EOPNOTSUPP;
3953
		}
3954

R
Roopa Prabhu 已提交
3955 3956
		conf->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
		conf->remote_ip.sa.sa_family = AF_INET6;
3957 3958 3959
	}

	if (data[IFLA_VXLAN_LOCAL]) {
3960 3961
		if (changelink && (conf->saddr.sa.sa_family != AF_INET)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL], "New local address family does not match old");
3962
			return -EOPNOTSUPP;
3963
		}
3964

R
Roopa Prabhu 已提交
3965 3966
		conf->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
		conf->saddr.sa.sa_family = AF_INET;
3967
	} else if (data[IFLA_VXLAN_LOCAL6]) {
3968 3969
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "IPv6 support not enabled in the kernel");
3970
			return -EPFNOSUPPORT;
3971
		}
3972

3973 3974
		if (changelink && (conf->saddr.sa.sa_family != AF_INET6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "New local address family does not match old");
3975
			return -EOPNOTSUPP;
3976
		}
3977

3978
		/* TODO: respect scope id */
R
Roopa Prabhu 已提交
3979 3980
		conf->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
		conf->saddr.sa.sa_family = AF_INET6;
3981 3982 3983
	}

	if (data[IFLA_VXLAN_LINK])
R
Roopa Prabhu 已提交
3984
		conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);
3985

S
stephen hemminger 已提交
3986
	if (data[IFLA_VXLAN_TOS])
R
Roopa Prabhu 已提交
3987
		conf->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
S
stephen hemminger 已提交
3988

3989
	if (data[IFLA_VXLAN_TTL])
R
Roopa Prabhu 已提交
3990
		conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
3991

H
Hangbin Liu 已提交
3992
	if (data[IFLA_VXLAN_TTL_INHERIT]) {
3993 3994 3995 3996 3997 3998
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_TTL_INHERIT,
				    VXLAN_F_TTL_INHERIT, changelink, false,
				    extack);
		if (err)
			return err;

H
Hangbin Liu 已提交
3999 4000
	}

4001
	if (data[IFLA_VXLAN_LABEL])
R
Roopa Prabhu 已提交
4002
		conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
4003 4004
			     IPV6_FLOWLABEL_MASK;

R
Roopa Prabhu 已提交
4005
	if (data[IFLA_VXLAN_LEARNING]) {
4006 4007 4008 4009 4010
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING,
				    VXLAN_F_LEARN, changelink, true,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4011 4012 4013 4014
	} else if (!changelink) {
		/* default to learn on a new device */
		conf->flags |= VXLAN_F_LEARN;
	}
S
stephen hemminger 已提交
4015

I
Ido Schimmel 已提交
4016
	if (data[IFLA_VXLAN_AGEING])
R
Roopa Prabhu 已提交
4017
		conf->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
S
stephen hemminger 已提交
4018

R
Roopa Prabhu 已提交
4019
	if (data[IFLA_VXLAN_PROXY]) {
4020 4021 4022 4023 4024
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_PROXY,
				    VXLAN_F_PROXY, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4025
	}
D
David Stevens 已提交
4026

R
Roopa Prabhu 已提交
4027
	if (data[IFLA_VXLAN_RSC]) {
4028 4029 4030 4031 4032
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_RSC,
				    VXLAN_F_RSC, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4033
	}
D
David Stevens 已提交
4034

R
Roopa Prabhu 已提交
4035
	if (data[IFLA_VXLAN_L2MISS]) {
4036 4037 4038 4039 4040
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L2MISS,
				    VXLAN_F_L2MISS, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4041
	}
D
David Stevens 已提交
4042

R
Roopa Prabhu 已提交
4043
	if (data[IFLA_VXLAN_L3MISS]) {
4044 4045 4046 4047 4048
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L3MISS,
				    VXLAN_F_L3MISS, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4049
	}
D
David Stevens 已提交
4050

R
Roopa Prabhu 已提交
4051
	if (data[IFLA_VXLAN_LIMIT]) {
4052 4053 4054
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LIMIT],
					    "Cannot change limit");
R
Roopa Prabhu 已提交
4055
			return -EOPNOTSUPP;
4056
		}
R
Roopa Prabhu 已提交
4057 4058
		conf->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
	}
S
stephen hemminger 已提交
4059

R
Roopa Prabhu 已提交
4060
	if (data[IFLA_VXLAN_COLLECT_METADATA]) {
4061 4062 4063 4064 4065
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_COLLECT_METADATA,
				    VXLAN_F_COLLECT_METADATA, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4066
	}
4067

4068
	if (data[IFLA_VXLAN_PORT_RANGE]) {
R
Roopa Prabhu 已提交
4069 4070 4071 4072 4073 4074
		if (!changelink) {
			const struct ifla_vxlan_port_range *p
				= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
			conf->port_min = ntohs(p->low);
			conf->port_max = ntohs(p->high);
		} else {
4075 4076
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE],
					    "Cannot change port range");
R
Roopa Prabhu 已提交
4077 4078
			return -EOPNOTSUPP;
		}
4079 4080
	}

R
Roopa Prabhu 已提交
4081
	if (data[IFLA_VXLAN_PORT]) {
4082 4083 4084
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT],
					    "Cannot change port");
R
Roopa Prabhu 已提交
4085
			return -EOPNOTSUPP;
4086
		}
R
Roopa Prabhu 已提交
4087 4088
		conf->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
	}
4089

R
Roopa Prabhu 已提交
4090
	if (data[IFLA_VXLAN_UDP_CSUM]) {
4091 4092 4093
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_UDP_CSUM],
					    "Cannot change UDP_CSUM flag");
R
Roopa Prabhu 已提交
4094
			return -EOPNOTSUPP;
4095
		}
R
Roopa Prabhu 已提交
4096 4097 4098
		if (!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
			conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
	}
4099

R
Roopa Prabhu 已提交
4100
	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) {
4101 4102 4103 4104 4105
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
				    VXLAN_F_UDP_ZERO_CSUM6_TX, changelink,
				    false, extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4106
	}
4107

R
Roopa Prabhu 已提交
4108
	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]) {
4109 4110 4111 4112 4113
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
				    VXLAN_F_UDP_ZERO_CSUM6_RX, changelink,
				    false, extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4114
	}
4115

R
Roopa Prabhu 已提交
4116
	if (data[IFLA_VXLAN_REMCSUM_TX]) {
4117 4118 4119 4120 4121
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_TX,
				    VXLAN_F_REMCSUM_TX, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4122
	}
T
Tom Herbert 已提交
4123

R
Roopa Prabhu 已提交
4124
	if (data[IFLA_VXLAN_REMCSUM_RX]) {
4125 4126 4127 4128 4129
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_RX,
				    VXLAN_F_REMCSUM_RX, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4130 4131 4132
	}

	if (data[IFLA_VXLAN_GBP]) {
4133 4134 4135 4136
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GBP,
				    VXLAN_F_GBP, changelink, false, extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4137 4138 4139
	}

	if (data[IFLA_VXLAN_GPE]) {
4140 4141 4142 4143 4144
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GPE,
				    VXLAN_F_GPE, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4145 4146 4147
	}

	if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) {
4148 4149 4150 4151 4152
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_NOPARTIAL,
				    VXLAN_F_REMCSUM_NOPARTIAL, changelink,
				    false, extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4153 4154 4155
	}

	if (tb[IFLA_MTU]) {
4156 4157 4158
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
					    "Cannot change mtu");
R
Roopa Prabhu 已提交
4159
			return -EOPNOTSUPP;
4160
		}
R
Roopa Prabhu 已提交
4161 4162 4163
		conf->mtu = nla_get_u32(tb[IFLA_MTU]);
	}

4164 4165 4166
	if (data[IFLA_VXLAN_DF])
		conf->df = nla_get_u8(data[IFLA_VXLAN_DF]);

R
Roopa Prabhu 已提交
4167 4168 4169 4170
	return 0;
}

static int vxlan_newlink(struct net *src_net, struct net_device *dev,
4171 4172
			 struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
R
Roopa Prabhu 已提交
4173 4174 4175 4176
{
	struct vxlan_config conf;
	int err;

4177
	err = vxlan_nl2conf(tb, data, dev, &conf, false, extack);
R
Roopa Prabhu 已提交
4178 4179 4180
	if (err)
		return err;

4181
	return __vxlan_dev_create(src_net, dev, &conf, extack);
R
Roopa Prabhu 已提交
4182
}
T
Tom Herbert 已提交
4183

R
Roopa Prabhu 已提交
4184
static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
4185 4186
			    struct nlattr *data[],
			    struct netlink_ext_ack *extack)
R
Roopa Prabhu 已提交
4187 4188
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
4189
	struct net_device *lowerdev;
R
Roopa Prabhu 已提交
4190
	struct vxlan_config conf;
4191
	struct vxlan_rdst *dst;
R
Roopa Prabhu 已提交
4192 4193
	int err;

4194
	dst = &vxlan->default_dst;
4195
	err = vxlan_nl2conf(tb, data, dev, &conf, true, extack);
R
Roopa Prabhu 已提交
4196 4197
	if (err)
		return err;
T
Thomas Graf 已提交
4198

4199 4200
	err = vxlan_config_validate(vxlan->net, &conf, &lowerdev,
				    vxlan, extack);
R
Roopa Prabhu 已提交
4201 4202
	if (err)
		return err;
4203

4204 4205 4206
	if (dst->remote_dev == lowerdev)
		lowerdev = NULL;

4207 4208 4209 4210 4211
	err = netdev_adjacent_change_prepare(dst->remote_dev, lowerdev, dev,
					     extack);
	if (err)
		return err;

R
Roopa Prabhu 已提交
4212
	/* handle default dst entry */
4213
	if (!vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip)) {
4214 4215 4216
		u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni);

		spin_lock_bh(&vxlan->hash_lock[hash_index]);
4217
		if (!vxlan_addr_any(&conf.remote_ip)) {
4218
			err = vxlan_fdb_update(vxlan, all_zeros_mac,
4219
					       &conf.remote_ip,
R
Roopa Prabhu 已提交
4220
					       NUD_REACHABLE | NUD_PERMANENT,
4221
					       NLM_F_APPEND | NLM_F_CREATE,
R
Roopa Prabhu 已提交
4222
					       vxlan->cfg.dst_port,
4223 4224
					       conf.vni, conf.vni,
					       conf.remote_ifindex,
4225
					       NTF_SELF, 0, true, extack);
R
Roopa Prabhu 已提交
4226
			if (err) {
4227
				spin_unlock_bh(&vxlan->hash_lock[hash_index]);
4228 4229
				netdev_adjacent_change_abort(dst->remote_dev,
							     lowerdev, dev);
R
Roopa Prabhu 已提交
4230 4231 4232
				return err;
			}
		}
4233 4234 4235 4236 4237 4238 4239 4240
		if (!vxlan_addr_any(&dst->remote_ip))
			__vxlan_fdb_delete(vxlan, all_zeros_mac,
					   dst->remote_ip,
					   vxlan->cfg.dst_port,
					   dst->remote_vni,
					   dst->remote_vni,
					   dst->remote_ifindex,
					   true);
4241
		spin_unlock_bh(&vxlan->hash_lock[hash_index]);
R
Roopa Prabhu 已提交
4242
	}
4243

4244 4245 4246
	if (conf.age_interval != vxlan->cfg.age_interval)
		mod_timer(&vxlan->age_timer, jiffies);

4247
	netdev_adjacent_change_commit(dst->remote_dev, lowerdev, dev);
4248
	if (lowerdev && lowerdev != dst->remote_dev) {
4249
		dst->remote_dev = lowerdev;
4250 4251
		netdev_update_lockdep_key(lowerdev);
	}
4252
	vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true);
R
Roopa Prabhu 已提交
4253
	return 0;
S
stephen hemminger 已提交
4254 4255 4256 4257 4258 4259
}

static void vxlan_dellink(struct net_device *dev, struct list_head *head)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

4260 4261
	vxlan_flush(vxlan, true);

4262
	list_del(&vxlan->next);
S
stephen hemminger 已提交
4263
	unregister_netdevice_queue(dev, head);
4264 4265
	if (vxlan->default_dst.remote_dev)
		netdev_upper_dev_unlink(vxlan->default_dst.remote_dev, dev);
S
stephen hemminger 已提交
4266 4267 4268 4269 4270 4271
}

static size_t vxlan_get_size(const struct net_device *dev)
{

	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
C
Cong Wang 已提交
4272
		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
S
stephen hemminger 已提交
4273
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
C
Cong Wang 已提交
4274
		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
S
stephen hemminger 已提交
4275
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
H
Hangbin Liu 已提交
4276
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL_INHERIT */
S
stephen hemminger 已提交
4277
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
4278
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_DF */
4279
		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
S
stephen hemminger 已提交
4280
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
D
David Stevens 已提交
4281 4282 4283 4284
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L2MISS */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L3MISS */
4285
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_COLLECT_METADATA */
S
stephen hemminger 已提交
4286 4287
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
4288
		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
4289 4290 4291 4292
		nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
T
Tom Herbert 已提交
4293 4294
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
S
stephen hemminger 已提交
4295 4296 4297 4298 4299 4300
		0;
}

static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	const struct vxlan_dev *vxlan = netdev_priv(dev);
4301
	const struct vxlan_rdst *dst = &vxlan->default_dst;
4302
	struct ifla_vxlan_port_range ports = {
4303 4304
		.low =  htons(vxlan->cfg.port_min),
		.high = htons(vxlan->cfg.port_max),
4305
	};
S
stephen hemminger 已提交
4306

4307
	if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni)))
S
stephen hemminger 已提交
4308 4309
		goto nla_put_failure;

C
Cong Wang 已提交
4310 4311
	if (!vxlan_addr_any(&dst->remote_ip)) {
		if (dst->remote_ip.sa.sa_family == AF_INET) {
4312 4313
			if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP,
					    dst->remote_ip.sin.sin_addr.s_addr))
C
Cong Wang 已提交
4314 4315 4316
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
4317 4318
			if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6,
					     &dst->remote_ip.sin6.sin6_addr))
C
Cong Wang 已提交
4319 4320 4321 4322
				goto nla_put_failure;
#endif
		}
	}
S
stephen hemminger 已提交
4323

4324
	if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
S
stephen hemminger 已提交
4325 4326
		goto nla_put_failure;

4327 4328
	if (!vxlan_addr_any(&vxlan->cfg.saddr)) {
		if (vxlan->cfg.saddr.sa.sa_family == AF_INET) {
4329
			if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
4330
					    vxlan->cfg.saddr.sin.sin_addr.s_addr))
C
Cong Wang 已提交
4331 4332 4333
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
4334
			if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
4335
					     &vxlan->cfg.saddr.sin6.sin6_addr))
C
Cong Wang 已提交
4336 4337 4338 4339
				goto nla_put_failure;
#endif
		}
	}
S
stephen hemminger 已提交
4340

4341
	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
H
Hangbin Liu 已提交
4342 4343
	    nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT,
		       !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) ||
4344
	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
4345
	    nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
4346
	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
D
David Stevens 已提交
4347
	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
4348
		       !!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
D
David Stevens 已提交
4349
	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
4350
		       !!(vxlan->cfg.flags & VXLAN_F_PROXY)) ||
4351 4352
	    nla_put_u8(skb, IFLA_VXLAN_RSC,
		       !!(vxlan->cfg.flags & VXLAN_F_RSC)) ||
D
David Stevens 已提交
4353
	    nla_put_u8(skb, IFLA_VXLAN_L2MISS,
4354
		       !!(vxlan->cfg.flags & VXLAN_F_L2MISS)) ||
D
David Stevens 已提交
4355
	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
4356
		       !!(vxlan->cfg.flags & VXLAN_F_L3MISS)) ||
4357
	    nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA,
4358
		       !!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) ||
4359 4360 4361
	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
4362
	    nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
4363
		       !(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
4364
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
4365
		       !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
4366
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
4367
		       !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
T
Tom Herbert 已提交
4368
	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
4369
		       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) ||
T
Tom Herbert 已提交
4370
	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
4371
		       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)))
S
stephen hemminger 已提交
4372 4373
		goto nla_put_failure;

4374 4375 4376
	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
		goto nla_put_failure;

4377
	if (vxlan->cfg.flags & VXLAN_F_GBP &&
T
Thomas Graf 已提交
4378 4379 4380
	    nla_put_flag(skb, IFLA_VXLAN_GBP))
		goto nla_put_failure;

4381
	if (vxlan->cfg.flags & VXLAN_F_GPE &&
J
Jiri Benc 已提交
4382 4383 4384
	    nla_put_flag(skb, IFLA_VXLAN_GPE))
		goto nla_put_failure;

4385
	if (vxlan->cfg.flags & VXLAN_F_REMCSUM_NOPARTIAL &&
4386 4387 4388
	    nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
		goto nla_put_failure;

S
stephen hemminger 已提交
4389 4390 4391 4392 4393 4394
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

4395 4396 4397 4398 4399 4400 4401
static struct net *vxlan_get_link_net(const struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

	return vxlan->net;
}

S
stephen hemminger 已提交
4402 4403 4404 4405 4406 4407 4408 4409
/* rtnetlink link type "vxlan": attribute policy plus the handlers used
 * to create, change, delete and dump vxlan devices.
 */
static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
	.kind		= "vxlan",
	.priv_size	= sizeof(struct vxlan_dev),

	/* attribute parsing */
	.maxtype	= IFLA_VXLAN_MAX,
	.policy		= vxlan_policy,
	.validate	= vxlan_validate,

	/* device life cycle */
	.setup		= vxlan_setup,
	.newlink	= vxlan_newlink,
	.changelink	= vxlan_changelink,
	.dellink	= vxlan_dellink,

	/* dumping */
	.get_size	= vxlan_get_size,
	.fill_info	= vxlan_fill_info,
	.get_link_net	= vxlan_get_link_net,
};

4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427
/* Create and configure a vxlan device from within the kernel.
 *
 * @net:              namespace passed to rtnl_create_link() and
 *                    __vxlan_dev_create()
 * @name:             interface name
 * @name_assign_type: passed through to rtnl_create_link()
 * @conf:             configuration for the new device
 *
 * Returns the new net_device, or an ERR_PTR() on failure. No netlink
 * attributes are supplied: the link is created from an all-NULL table.
 */
struct net_device *vxlan_dev_create(struct net *net, const char *name,
				    u8 name_assign_type,
				    struct vxlan_config *conf)
{
	struct nlattr *tb[IFLA_MAX + 1] = { NULL };
	struct net_device *dev;
	int err;

	dev = rtnl_create_link(net, name, name_assign_type,
			       &vxlan_link_ops, tb, NULL);
	if (IS_ERR(dev))
		return dev;

	err = __vxlan_dev_create(net, dev, conf, NULL);
	if (err < 0) {
		free_netdev(dev);
		return ERR_PTR(err);
	}

	err = rtnl_configure_link(dev, NULL);
	if (err < 0) {
		LIST_HEAD(kill_list);

		/* Tear the freshly created device down again */
		vxlan_dellink(dev, &kill_list);
		unregister_netdevice_many(&kill_list);
		return ERR_PTR(err);
	}

	return dev;
}
EXPORT_SYMBOL_GPL(vxlan_dev_create);

4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472
static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
					     struct net_device *dev)
{
	struct vxlan_dev *vxlan, *next;
	LIST_HEAD(list_kill);

	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
		struct vxlan_rdst *dst = &vxlan->default_dst;

		/* In case we created vxlan device with carrier
		 * and we loose the carrier due to module unload
		 * we also need to remove vxlan device. In other
		 * cases, it's not necessary and remote_ifindex
		 * is 0 here, so no matches.
		 */
		if (dst->remote_ifindex == dev->ifindex)
			vxlan_dellink(vxlan->dev, &list_kill);
	}

	unregister_netdevice_many(&list_kill);
}

4473 4474
static int vxlan_netdevice_event(struct notifier_block *unused,
				 unsigned long event, void *ptr)
4475 4476
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4477
	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
4478

4479 4480
	if (event == NETDEV_UNREGISTER) {
		vxlan_offload_rx_ports(dev, false);
4481
		vxlan_handle_lowerdev_unregister(vn, dev);
4482 4483 4484 4485
	} else if (event == NETDEV_REGISTER) {
		vxlan_offload_rx_ports(dev, true);
	} else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO ||
		   event == NETDEV_UDP_TUNNEL_DROP_INFO) {
4486
		vxlan_offload_rx_ports(dev, event == NETDEV_UDP_TUNNEL_PUSH_INFO);
4487
	}
4488 4489 4490 4491 4492

	return NOTIFY_DONE;
}

/* Netdevice notifier; registered in vxlan_init_module(). */
static struct notifier_block vxlan_notifier_block __read_mostly = {
	.notifier_call = vxlan_netdevice_event,
};

4496 4497 4498 4499 4500 4501 4502
static void
vxlan_fdb_offloaded_set(struct net_device *dev,
			struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
4503 4504 4505
	u32 hash_index;

	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
4506

4507
	spin_lock_bh(&vxlan->hash_lock[hash_index]);
4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522

	f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
	if (!f)
		goto out;

	rdst = vxlan_fdb_find_rdst(f, &fdb_info->remote_ip,
				   fdb_info->remote_port,
				   fdb_info->remote_vni,
				   fdb_info->remote_ifindex);
	if (!rdst)
		goto out;

	rdst->offloaded = fdb_info->offloaded;

out:
4523
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
4524 4525
}

P
Petr Machata 已提交
4526 4527 4528 4529 4530
static int
vxlan_fdb_external_learn_add(struct net_device *dev,
			     struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
4531
	struct netlink_ext_ack *extack;
4532
	u32 hash_index;
P
Petr Machata 已提交
4533 4534
	int err;

4535
	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
4536 4537
	extack = switchdev_notifier_info_to_extack(&fdb_info->info);

4538
	spin_lock_bh(&vxlan->hash_lock[hash_index]);
P
Petr Machata 已提交
4539 4540 4541 4542 4543 4544 4545 4546
	err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip,
			       NUD_REACHABLE,
			       NLM_F_CREATE | NLM_F_REPLACE,
			       fdb_info->remote_port,
			       fdb_info->vni,
			       fdb_info->remote_vni,
			       fdb_info->remote_ifindex,
			       NTF_USE | NTF_SELF | NTF_EXT_LEARNED,
4547
			       0, false, extack);
4548
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
P
Petr Machata 已提交
4549 4550 4551 4552 4553 4554 4555 4556 4557 4558

	return err;
}

static int
vxlan_fdb_external_learn_del(struct net_device *dev,
			     struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
4559
	u32 hash_index;
P
Petr Machata 已提交
4560 4561
	int err = 0;

4562 4563
	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
	spin_lock_bh(&vxlan->hash_lock[hash_index]);
P
Petr Machata 已提交
4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576

	f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
	if (!f)
		err = -ENOENT;
	else if (f->flags & NTF_EXT_LEARNED)
		err = __vxlan_fdb_delete(vxlan, fdb_info->eth_addr,
					 fdb_info->remote_ip,
					 fdb_info->remote_port,
					 fdb_info->vni,
					 fdb_info->remote_vni,
					 fdb_info->remote_ifindex,
					 false);

4577
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
P
Petr Machata 已提交
4578 4579 4580 4581

	return err;
}

4582 4583 4584 4585
static int vxlan_switchdev_event(struct notifier_block *unused,
				 unsigned long event, void *ptr)
{
	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
P
Petr Machata 已提交
4586 4587
	struct switchdev_notifier_vxlan_fdb_info *fdb_info;
	int err = 0;
4588 4589 4590 4591 4592

	switch (event) {
	case SWITCHDEV_VXLAN_FDB_OFFLOADED:
		vxlan_fdb_offloaded_set(dev, ptr);
		break;
P
Petr Machata 已提交
4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612
	case SWITCHDEV_VXLAN_FDB_ADD_TO_BRIDGE:
		fdb_info = ptr;
		err = vxlan_fdb_external_learn_add(dev, fdb_info);
		if (err) {
			err = notifier_from_errno(err);
			break;
		}
		fdb_info->offloaded = true;
		vxlan_fdb_offloaded_set(dev, fdb_info);
		break;
	case SWITCHDEV_VXLAN_FDB_DEL_TO_BRIDGE:
		fdb_info = ptr;
		err = vxlan_fdb_external_learn_del(dev, fdb_info);
		if (err) {
			err = notifier_from_errno(err);
			break;
		}
		fdb_info->offloaded = false;
		vxlan_fdb_offloaded_set(dev, fdb_info);
		break;
4613 4614
	}

P
Petr Machata 已提交
4615
	return err;
4616 4617 4618 4619 4620 4621
}

/* Switchdev notifier; registered in vxlan_init_module(). */
static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = {
	.notifier_call = vxlan_switchdev_event,
};

4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641
/* Destroy every FDB entry linked on @nh's fdb_list.
 *
 * The list is walked under rcu_read_lock(); each entry is destroyed
 * under its hash bucket lock, and only if it is still hashed.
 */
static void vxlan_fdb_nh_flush(struct nexthop *nh)
{
	struct vxlan_fdb *fdb;
	struct vxlan_dev *vxlan;
	u32 hash_index;

	rcu_read_lock();
	list_for_each_entry_rcu(fdb, &nh->fdb_list, nh_list) {
		vxlan = rcu_dereference(fdb->vdev);
		/* An entry without a device is unexpected: warn, and skip
		 * it rather than dereference a NULL pointer below.
		 */
		if (WARN_ON(!vxlan))
			continue;

		/* NOTE(review): the bucket is derived from the device's
		 * default VNI, not from the entry itself - confirm this
		 * matches the bucket the entry was inserted into.
		 */
		hash_index = fdb_head_index(vxlan, fdb->eth_addr,
					    vxlan->default_dst.remote_vni);
		spin_lock_bh(&vxlan->hash_lock[hash_index]);
		if (!hlist_unhashed(&fdb->hlist))
			vxlan_fdb_destroy(vxlan, fdb, false, false);
		spin_unlock_bh(&vxlan->hash_lock[hash_index]);
	}
	rcu_read_unlock();
}

4642 4643 4644 4645 4646 4647 4648 4649
/* Nexthop notifier callback: when a nexthop is deleted, flush the FDB
 * entries attached to it. All other events, and a NULL nexthop, are
 * ignored. Always returns NOTIFY_DONE.
 */
static int vxlan_nexthop_event(struct notifier_block *nb,
			       unsigned long event, void *ptr)
{
	struct nexthop *nh = ptr;

	if (nh && event == NEXTHOP_EVENT_DEL)
		vxlan_fdb_nh_flush(nh);

	return NOTIFY_DONE;
}

/* Nexthop notifier; registered per namespace in vxlan_init_net() and
 * unregistered in vxlan_exit_batch_net().
 */
static struct notifier_block vxlan_nexthop_notifier_block __read_mostly = {
	.notifier_call = vxlan_nexthop_event,
};

S
stephen hemminger 已提交
4659 4660 4661
static __net_init int vxlan_init_net(struct net *net)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
4662
	unsigned int h;
S
stephen hemminger 已提交
4663

4664
	INIT_LIST_HEAD(&vn->vxlan_list);
4665
	spin_lock_init(&vn->sock_lock);
S
stephen hemminger 已提交
4666

4667 4668
	for (h = 0; h < PORT_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vn->sock_list[h]);
S
stephen hemminger 已提交
4669

4670
	return register_nexthop_notifier(net, &vxlan_nexthop_notifier_block);
S
stephen hemminger 已提交
4671 4672
}

4673
static void vxlan_destroy_tunnels(struct net *net, struct list_head *head)
N
Nicolas Dichtel 已提交
4674 4675 4676 4677
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan, *next;
	struct net_device *dev, *aux;
4678
	unsigned int h;
N
Nicolas Dichtel 已提交
4679 4680 4681

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == &vxlan_link_ops)
4682
			unregister_netdevice_queue(dev, head);
N
Nicolas Dichtel 已提交
4683 4684 4685 4686 4687

	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
		/* If vxlan->dev is in the same netns, it has already been added
		 * to the list by the previous loop.
		 */
4688
		if (!net_eq(dev_net(vxlan->dev), net))
4689
			unregister_netdevice_queue(vxlan->dev, head);
N
Nicolas Dichtel 已提交
4690 4691
	}

4692 4693
	for (h = 0; h < PORT_HASH_SIZE; ++h)
		WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h]));
N
Nicolas Dichtel 已提交
4694 4695
}

4696 4697 4698 4699 4700 4701
/* Batched per-namespace exit. Under a single rtnl_lock(): unregister the
 * nexthop notifier of every exiting namespace, collect all their vxlan
 * devices, and unregister the collected devices in one go.
 */
static void __net_exit vxlan_exit_batch_net(struct list_head *net_list)
{
	LIST_HEAD(kill_list);
	struct net *net;

	rtnl_lock();

	list_for_each_entry(net, net_list, exit_list)
		unregister_nexthop_notifier(net, &vxlan_nexthop_notifier_block);

	list_for_each_entry(net, net_list, exit_list)
		vxlan_destroy_tunnels(net, &kill_list);

	unregister_netdevice_many(&kill_list);

	rtnl_unlock();
}

S
stephen hemminger 已提交
4711 4712
static struct pernet_operations vxlan_net_ops = {
	.init = vxlan_init_net,
4713
	.exit_batch = vxlan_exit_batch_net,
S
stephen hemminger 已提交
4714 4715 4716 4717 4718 4719 4720 4721 4722 4723
	.id   = &vxlan_net_id,
	.size = sizeof(struct vxlan_net),
};

/* Module init: seed the hash salt, then register the pernet subsystem,
 * the netdevice notifier, the switchdev notifier and the rtnl link ops,
 * in that order. If one step fails, the earlier ones are undone in
 * reverse order and its error is returned.
 */
static int __init vxlan_init_module(void)
{
	int rc;

	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));

	rc = register_pernet_subsys(&vxlan_net_ops);
	if (rc)
		return rc;

	rc = register_netdevice_notifier(&vxlan_notifier_block);
	if (rc)
		goto err_pernet;

	rc = register_switchdev_notifier(&vxlan_switchdev_notifier_block);
	if (rc)
		goto err_netdev_notifier;

	rc = rtnl_link_register(&vxlan_link_ops);
	if (rc)
		goto err_switchdev_notifier;

	return 0;

err_switchdev_notifier:
	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
err_netdev_notifier:
	unregister_netdevice_notifier(&vxlan_notifier_block);
err_pernet:
	unregister_pernet_subsys(&vxlan_net_ops);
	return rc;
}
late_initcall(vxlan_init_module);
S
stephen hemminger 已提交
4751 4752 4753

/* Module exit: undo vxlan_init_module() in reverse registration order. */
static void __exit vxlan_cleanup_module(void)
{
	rtnl_link_unregister(&vxlan_link_ops);

	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
	unregister_netdevice_notifier(&vxlan_notifier_block);

	unregister_pernet_subsys(&vxlan_net_ops);
	/* rcu_barrier() is called by netns */
}
module_exit(vxlan_cleanup_module);

MODULE_LICENSE("GPL");
MODULE_VERSION(VXLAN_VERSION);
4764
MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>");
J
Jesse Brandeburg 已提交
4765
MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic");
S
stephen hemminger 已提交
4766
MODULE_ALIAS_RTNL_LINK("vxlan");