vxlan.c 118.4 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0-only
S
stephen hemminger 已提交
2
/*
R
Rami Rosen 已提交
3
 * VXLAN: Virtual eXtensible Local Area Network
S
stephen hemminger 已提交
4
 *
5
 * Copyright (c) 2012-2013 Vyatta Inc.
S
stephen hemminger 已提交
6 7 8 9 10 11 12 13 14 15 16
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/udp.h>
#include <linux/igmp.h>
#include <linux/if_ether.h>
Y
Yan Burman 已提交
17
#include <linux/ethtool.h>
D
David Stevens 已提交
18 19
#include <net/arp.h>
#include <net/ndisc.h>
20
#include <net/ipv6_stubs.h>
S
stephen hemminger 已提交
21 22 23 24 25 26
#include <net/ip.h>
#include <net/icmp.h>
#include <net/rtnetlink.h>
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
27
#include <net/tun_proto.h>
28
#include <net/vxlan.h>
29
#include <net/nexthop.h>
30

C
Cong Wang 已提交
31 32
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ip6_tunnel.h>
33
#include <net/ip6_checksum.h>
C
Cong Wang 已提交
34
#endif
S
stephen hemminger 已提交
35 36 37

#define VXLAN_VERSION	"0.1"

38 39
#define PORT_HASH_BITS	8
#define PORT_HASH_SIZE  (1<<PORT_HASH_BITS)
S
stephen hemminger 已提交
40 41 42
#define FDB_AGE_DEFAULT 300 /* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */

43 44
/* UDP port for VXLAN traffic.
 * The IANA assigned port is 4789, but the Linux default is 8472
S
Stephen Hemminger 已提交
45
 * for compatibility with early adopters.
46
 */
47 48
static unsigned short vxlan_port __read_mostly = 8472;
module_param_named(udp_port, vxlan_port, ushort, 0444);
S
stephen hemminger 已提交
49 50 51 52 53 54
MODULE_PARM_DESC(udp_port, "Destination UDP port");

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

55
static unsigned int vxlan_net_id;
56
static struct rtnl_link_ops vxlan_link_ops;
57

58
static const u8 all_zeros_mac[ETH_ALEN + 2];
59

60
static int vxlan_sock_add(struct vxlan_dev *vxlan);
61

62 63
static void vxlan_vs_del_dev(struct vxlan_dev *vxlan);

64 65 66 67
/* per-network namespace private data for this module */
struct vxlan_net {
	struct list_head  vxlan_list;
	struct hlist_head sock_list[PORT_HASH_SIZE];
68
	spinlock_t	  sock_lock;
69 70
};

S
stephen hemminger 已提交
71 72 73 74 75 76
/* Forwarding table entry */
struct vxlan_fdb {
	struct hlist_node hlist;	/* linked list of entries */
	struct rcu_head	  rcu;
	unsigned long	  updated;	/* jiffies */
	unsigned long	  used;
77
	struct list_head  remotes;
78
	u8		  eth_addr[ETH_ALEN];
S
stephen hemminger 已提交
79
	u16		  state;	/* see ndm_state */
80
	__be32		  vni;
P
Petr Machata 已提交
81
	u16		  flags;	/* see ndm_flags and below */
82 83
	struct list_head  nh_list;
	struct nexthop __rcu *nh;
S
stephen hemminger 已提交
84 85
};

P
Petr Machata 已提交
86 87
#define NTF_VXLAN_ADDED_BY_USER 0x100

S
stephen hemminger 已提交
88 89 90
/* salt for hash table */
static u32 vxlan_salt __read_mostly;

T
Thomas Graf 已提交
91 92
static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
{
93 94
	return vs->flags & VXLAN_F_COLLECT_METADATA ||
	       ip_tunnel_collect_metadata();
T
Thomas Graf 已提交
95 96
}

C
Cong Wang 已提交
97 98 99 100
#if IS_ENABLED(CONFIG_IPV6)
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
J
Jiri Benc 已提交
101 102 103 104 105 106
	if (a->sa.sa_family != b->sa.sa_family)
		return false;
	if (a->sa.sa_family == AF_INET6)
		return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
	else
		return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
C
Cong Wang 已提交
107 108 109 110
}

static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
J
Jiri Benc 已提交
111
	if (nla_len(nla) >= sizeof(struct in6_addr)) {
112
		ip->sin6.sin6_addr = nla_get_in6_addr(nla);
J
Jiri Benc 已提交
113 114 115
		ip->sa.sa_family = AF_INET6;
		return 0;
	} else if (nla_len(nla) >= sizeof(__be32)) {
116
		ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
J
Jiri Benc 已提交
117 118 119 120 121
		ip->sa.sa_family = AF_INET;
		return 0;
	} else {
		return -EAFNOSUPPORT;
	}
C
Cong Wang 已提交
122 123 124
}

static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
J
Jiri Benc 已提交
125
			      const union vxlan_addr *ip)
C
Cong Wang 已提交
126
{
J
Jiri Benc 已提交
127
	if (ip->sa.sa_family == AF_INET6)
128
		return nla_put_in6_addr(skb, attr, &ip->sin6.sin6_addr);
J
Jiri Benc 已提交
129
	else
130
		return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
C
Cong Wang 已提交
131 132 133 134 135 136 137
}

#else /* !CONFIG_IPV6 */

static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
J
Jiri Benc 已提交
138
	return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
C
Cong Wang 已提交
139 140 141 142
}

static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
J
Jiri Benc 已提交
143 144 145
	if (nla_len(nla) >= sizeof(struct in6_addr)) {
		return -EAFNOSUPPORT;
	} else if (nla_len(nla) >= sizeof(__be32)) {
146
		ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
J
Jiri Benc 已提交
147 148 149 150 151
		ip->sa.sa_family = AF_INET;
		return 0;
	} else {
		return -EAFNOSUPPORT;
	}
C
Cong Wang 已提交
152 153 154
}

static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
J
Jiri Benc 已提交
155
			      const union vxlan_addr *ip)
C
Cong Wang 已提交
156
{
157
	return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
C
Cong Wang 已提交
158 159 160
}
#endif

161
/* Virtual Network hash table head */
162
static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni)
163
{
164
	return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)];
165 166 167 168
}

/* Hash chain of the per-namespace socket table that UDP @port falls into. */
static inline struct hlist_head *vs_head(struct net *net, __be16 port)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	const u32 bucket = hash_32(ntohs(port), PORT_HASH_BITS);

	return &vn->sock_list[bucket];
}

175 176 177
/* First remote destination for a forwarding entry.
 * Guaranteed to be non-NULL because remotes are never deleted.
 */
178
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
179
{
180 181
	if (rcu_access_pointer(fdb->nh))
		return NULL;
182 183 184 185 186
	return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
}

static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
{
187 188
	if (rcu_access_pointer(fdb->nh))
		return NULL;
189
	return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
190 191
}

192 193 194 195
/* Find VXLAN socket based on network namespace, address family and UDP port
 * and enabled unshareable flags.
 */
static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
196
					  __be16 port, u32 flags, int ifindex)
197 198
{
	struct vxlan_sock *vs;
199 200

	flags &= VXLAN_F_RCV_FLAGS;
201 202

	hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
203
		if (inet_sk(vs->sock->sk)->inet_sport == port &&
204
		    vxlan_get_sk_family(vs) == family &&
205 206
		    vs->flags == flags &&
		    vs->sock->sk->sk_bound_dev_if == ifindex)
207 208 209
			return vs;
	}
	return NULL;
S
stephen hemminger 已提交
210 211
}

212 213
/* Find the VXLAN device attached to socket @vs that serves @vni.
 * For IPv6 link-local configurations the device's remote_ifindex must
 * also equal @ifindex. Returns the device or NULL.
 */
static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex,
					   __be32 vni)
{
	struct vxlan_dev_node *node;

	/* For flow based devices, map all packets to VNI 0 */
	if (vs->flags & VXLAN_F_COLLECT_METADATA)
		vni = 0;

	hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) {
		struct vxlan_dev *candidate = node->vxlan;

		if (candidate->default_dst.remote_vni != vni)
			continue;

		if (IS_ENABLED(CONFIG_IPV6) &&
		    (candidate->cfg.flags & VXLAN_F_IPV6_LINKLOCAL) &&
		    candidate->cfg.remote_ifindex != ifindex)
			continue;

		return candidate;
	}

	return NULL;
}

P
Pravin B Shelar 已提交
239
/* Look up VNI in a per net namespace table */
240 241 242
static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex,
					__be32 vni, sa_family_t family,
					__be16 port, u32 flags)
P
Pravin B Shelar 已提交
243 244 245
{
	struct vxlan_sock *vs;

246
	vs = vxlan_find_sock(net, family, port, flags, ifindex);
P
Pravin B Shelar 已提交
247 248 249
	if (!vs)
		return NULL;

250
	return vxlan_vs_find_vni(vs, ifindex, vni);
P
Pravin B Shelar 已提交
251 252
}

S
stephen hemminger 已提交
253 254
/* Fill in a neighbour message in @skb describing @fdb and, when given,
 * the remote destination @rdst.
 *
 * Returns 0 on success. If the header or any attribute does not fit,
 * the partially built message is cancelled and -EMSGSIZE is returned.
 */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
			  const struct vxlan_fdb *fdb,
			  u32 portid, u32 seq, int type, unsigned int flags,
			  const struct vxlan_rdst *rdst)
{
	unsigned long now = jiffies;
	bool send_eth = true;
	bool send_ip = true;
	struct nda_cacheinfo ci;
	struct nlmsghdr *nlh;
	struct nexthop *nh;
	struct ndmsg *ndm;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
	if (!nlh)
		return -EMSGSIZE;

	ndm = nlmsg_data(nlh);
	memset(ndm, 0, sizeof(*ndm));

	nh = rcu_dereference_rtnl(fdb->nh);

	if (type != RTM_GETNEIGH) {
		ndm->ndm_family = AF_BRIDGE;
	} else {
		/* Miss notifications: omit an unspecified IP / zero MAC. */
		if (rdst) {
			send_ip = !vxlan_addr_any(&rdst->remote_ip);
			if (send_ip)
				ndm->ndm_family = rdst->remote_ip.sa.sa_family;
			else
				ndm->ndm_family = AF_INET;
		} else if (nh) {
			ndm->ndm_family = nexthop_get_family(nh);
		}
		send_eth = !is_zero_ether_addr(fdb->eth_addr);
	}

	ndm->ndm_state = fdb->state;
	ndm->ndm_ifindex = vxlan->dev->ifindex;
	ndm->ndm_flags = fdb->flags;
	if (rdst && rdst->offloaded)
		ndm->ndm_flags |= NTF_OFFLOADED;
	ndm->ndm_type = RTN_UNICAST;

	/* Report the link netns id only when it differs from the device's. */
	if (!net_eq(dev_net(vxlan->dev), vxlan->net)) {
		if (nla_put_s32(skb, NDA_LINK_NETNSID,
				peernet2id(dev_net(vxlan->dev), vxlan->net)))
			goto nla_put_failure;
	}

	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
		goto nla_put_failure;

	/* A nexthop id and the remote destination attributes are exclusive. */
	if (nh) {
		if (nla_put_u32(skb, NDA_NH_ID, nh->id))
			goto nla_put_failure;
	} else if (rdst) {
		if (send_ip &&
		    vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
			goto nla_put_failure;

		if (rdst->remote_port &&
		    rdst->remote_port != vxlan->cfg.dst_port &&
		    nla_put_be16(skb, NDA_PORT, rdst->remote_port))
			goto nla_put_failure;

		if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
		    nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
			goto nla_put_failure;

		if (rdst->remote_ifindex &&
		    nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
			goto nla_put_failure;
	}

	if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni) {
		if (nla_put_u32(skb, NDA_SRC_VNI, be32_to_cpu(fdb->vni)))
			goto nla_put_failure;
	}

	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
	ci.ndm_confirmed = 0;
	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
	ci.ndm_refcnt	 = 0;

	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static inline size_t vxlan_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct ndmsg))
		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
C
Cong Wang 已提交
345
		+ nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
346
		+ nla_total_size(sizeof(__be16)) /* NDA_PORT */
347 348
		+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
		+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
349
		+ nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */
S
stephen hemminger 已提交
350 351 352
		+ nla_total_size(sizeof(struct nda_cacheinfo));
}

P
Petr Machata 已提交
353 354
static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			       struct vxlan_rdst *rd, int type)
S
stephen hemminger 已提交
355 356 357 358 359 360 361 362 363
{
	struct net *net = dev_net(vxlan->dev);
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
	if (skb == NULL)
		goto errout;

364
	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd);
S
stephen hemminger 已提交
365 366 367 368 369 370 371 372 373 374 375 376 377 378
	if (err < 0) {
		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}

379 380 381
static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan,
			    const struct vxlan_fdb *fdb,
			    const struct vxlan_rdst *rd,
382
			    struct netlink_ext_ack *extack,
383 384 385
			    struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	fdb_info->info.dev = vxlan->dev;
386
	fdb_info->info.extack = extack;
387 388 389 390 391 392 393 394 395 396
	fdb_info->remote_ip = rd->remote_ip;
	fdb_info->remote_port = rd->remote_port;
	fdb_info->remote_vni = rd->remote_vni;
	fdb_info->remote_ifindex = rd->remote_ifindex;
	memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN);
	fdb_info->vni = fdb->vni;
	fdb_info->offloaded = rd->offloaded;
	fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER;
}

397 398 399
static int vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan,
					      struct vxlan_fdb *fdb,
					      struct vxlan_rdst *rd,
400 401
					      bool adding,
					      struct netlink_ext_ack *extack)
P
Petr Machata 已提交
402 403 404
{
	struct switchdev_notifier_vxlan_fdb_info info;
	enum switchdev_notifier_type notifier_type;
405
	int ret;
P
Petr Machata 已提交
406 407

	if (WARN_ON(!rd))
408
		return 0;
P
Petr Machata 已提交
409 410 411

	notifier_type = adding ? SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE
			       : SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE;
412
	vxlan_fdb_switchdev_notifier_info(vxlan, fdb, rd, NULL, &info);
413
	ret = call_switchdev_notifiers(notifier_type, vxlan->dev,
414
				       &info.info, extack);
415
	return notifier_to_errno(ret);
P
Petr Machata 已提交
416 417
}

418
/* Announce an FDB event of @type.
 * When @swdev_notify is set and @rd is given, switchdev listeners are
 * called first for RTM_NEWNEIGH and RTM_DELNEIGH; a failure on the add
 * path is returned and suppresses the netlink notification. The result
 * of the delete path is ignored. Returns 0 otherwise.
 */
static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			    struct vxlan_rdst *rd, int type, bool swdev_notify,
			    struct netlink_ext_ack *extack)
{
	int err;

	if (swdev_notify && rd) {
		if (type == RTM_NEWNEIGH) {
			err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
								 true, extack);
			if (err)
				return err;
		} else if (type == RTM_DELNEIGH) {
			vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
							   false, extack);
		}
	}

	__vxlan_fdb_notify(vxlan, fdb, rd, type);
	return 0;
}

C
Cong Wang 已提交
443
static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
D
David Stevens 已提交
444 445
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
446 447 448 449
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
	struct vxlan_rdst remote = {
C
Cong Wang 已提交
450
		.remote_ip = *ipa, /* goes to NDA_DST */
451
		.remote_vni = cpu_to_be32(VXLAN_N_VID),
452
	};
453

454
	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL);
D
David Stevens 已提交
455 456 457 458
}

static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
{
459 460 461
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
462
	struct vxlan_rdst remote = { };
D
David Stevens 已提交
463 464 465

	memcpy(f.eth_addr, eth_addr, ETH_ALEN);

466
	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL);
D
David Stevens 已提交
467 468
}

S
stephen hemminger 已提交
469 470 471 472 473 474 475 476
/* Hash Ethernet address */
static u32 eth_hash(const unsigned char *addr)
{
	u64 value = get_unaligned((u64 *)addr);

	/* only want 6 bytes */
#ifdef __BIG_ENDIAN
	value >>= 16;
477 478
#else
	value <<= 16;
S
stephen hemminger 已提交
479 480 481 482
#endif
	return hash_64(value, FDB_HASH_BITS);
}

483 484 485 486 487 488 489 490
/* Hash the last four bytes of an Ethernet address together with a VNI,
 * salted with vxlan_salt, into an FDB bucket index.
 */
static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
{
	/* use 1 byte of OUI and 3 bytes of NIC */
	const u32 mac_tail = get_unaligned((u32 *)(addr + 2));
	const u32 mixed = jhash_2words(mac_tail, vni, vxlan_salt);

	return mixed & (FDB_HASH_SIZE - 1);
}

491 492 493 494 495 496 497 498
/* FDB bucket index for (@mac, @vni). The VNI only takes part in the
 * hash when the device has VXLAN_F_COLLECT_METADATA set.
 */
static u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni)
{
	if (!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA))
		return eth_hash(mac);

	return eth_vni_hash(mac, vni);
}

S
stephen hemminger 已提交
499 500
/* Hash chain to use given mac address */
static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
501
						const u8 *mac, __be32 vni)
S
stephen hemminger 已提交
502
{
503
	return &vxlan->fdb_head[fdb_head_index(vxlan, mac, vni)];
S
stephen hemminger 已提交
504 505 506
}

/* Look up Ethernet address in forwarding table */
507
static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
508
					  const u8 *mac, __be32 vni)
S
stephen hemminger 已提交
509
{
510
	struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni);
S
stephen hemminger 已提交
511 512
	struct vxlan_fdb *f;

513
	hlist_for_each_entry_rcu(f, head, hlist) {
514
		if (ether_addr_equal(mac, f->eth_addr)) {
515
			if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
516 517 518 519 520 521
				if (vni == f->vni)
					return f;
			} else {
				return f;
			}
		}
S
stephen hemminger 已提交
522 523 524 525 526
	}

	return NULL;
}

527
static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
528
					const u8 *mac, __be32 vni)
529 530 531
{
	struct vxlan_fdb *f;

532
	f = __vxlan_find_mac(vxlan, mac, vni);
533
	if (f && f->used != jiffies)
534 535 536 537 538
		f->used = jiffies;

	return f;
}

539 540
/* caller should hold vxlan->hash_lock */
static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
C
Cong Wang 已提交
541
					      union vxlan_addr *ip, __be16 port,
542
					      __be32 vni, __u32 ifindex)
543
{
544
	struct vxlan_rdst *rd;
545

546
	list_for_each_entry(rd, &f->remotes, list) {
C
Cong Wang 已提交
547
		if (vxlan_addr_equal(&rd->remote_ip, ip) &&
548 549 550
		    rd->remote_port == port &&
		    rd->remote_vni == vni &&
		    rd->remote_ifindex == ifindex)
551
			return rd;
552
	}
553

554 555 556
	return NULL;
}

557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580
/* Look up the unicast FDB entry for (@mac, @vni) on @dev and describe
 * its first remote destination in @fdb_info.
 *
 * Returns 0 on success, -EINVAL for a multicast or all-zero @mac, and
 * -ENOENT when no entry exists or the entry has no remote destination
 * to report (it is backed by a nexthop).
 */
int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni,
		      struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	/* The lookup hash may load 8 bytes from the key, hence the padding. */
	u8 eth_addr[ETH_ALEN + 2] = { 0 };
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	int rc = 0;

	if (is_multicast_ether_addr(mac) ||
	    is_zero_ether_addr(mac))
		return -EINVAL;

	ether_addr_copy(eth_addr, mac);

	rcu_read_lock();

	f = __vxlan_find_mac(vxlan, eth_addr, vni);
	if (!f) {
		rc = -ENOENT;
		goto out;
	}

	/* first_remote_rcu() yields NULL for nexthop-backed entries; the
	 * info helper dereferences the remote unconditionally.
	 */
	rdst = first_remote_rcu(f);
	if (!rdst) {
		rc = -ENOENT;
		goto out;
	}

	vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, NULL, fdb_info);

out:
	rcu_read_unlock();
	return rc;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_find_uc);

P
Petr Machata 已提交
589 590 591
static int vxlan_fdb_notify_one(struct notifier_block *nb,
				const struct vxlan_dev *vxlan,
				const struct vxlan_fdb *f,
592 593
				const struct vxlan_rdst *rdst,
				struct netlink_ext_ack *extack)
P
Petr Machata 已提交
594 595 596 597
{
	struct switchdev_notifier_vxlan_fdb_info fdb_info;
	int rc;

598
	vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, extack, &fdb_info);
P
Petr Machata 已提交
599 600 601 602 603 604
	rc = nb->notifier_call(nb, SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE,
			       &fdb_info);
	return notifier_to_errno(rc);
}

/* Replay every remote of every FDB entry of @dev whose VNI equals @vni
 * to notifier block @nb as an "add to device" event.
 *
 * Each hash bucket is walked under its own hash_lock. Returns -EINVAL
 * if @dev is not a VXLAN device, the first non-zero notifier error, or
 * 0 when all entries were replayed.
 */
int vxlan_fdb_replay(const struct net_device *dev, __be32 vni,
		     struct notifier_block *nb,
		     struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan;
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	unsigned int h;
	int rc = 0;

	if (!netif_is_vxlan(dev))
		return -EINVAL;
	vxlan = netdev_priv(dev);

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		spin_lock_bh(&vxlan->hash_lock[h]);
		hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) {
			if (f->vni != vni)
				continue;

			list_for_each_entry(rdst, &f->remotes, list) {
				rc = vxlan_fdb_notify_one(nb, vxlan, f, rdst,
							  extack);
				if (rc)
					goto unlock;
			}
		}
		spin_unlock_bh(&vxlan->hash_lock[h]);
	}

	return 0;

unlock:
	/* Bailed out mid-bucket: drop the lock of the bucket being walked. */
	spin_unlock_bh(&vxlan->hash_lock[h]);
	return rc;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_replay);

641 642 643 644 645 646 647 648 649 650 651 652
/* Clear the 'offloaded' mark on every remote of every FDB entry of
 * @dev whose VNI equals @vni. Does nothing if @dev is not a VXLAN
 * device. Each hash bucket is walked under its own hash_lock.
 */
void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni)
{
	struct vxlan_dev *vxlan;
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	unsigned int h;

	if (!netif_is_vxlan(dev))
		return;
	vxlan = netdev_priv(dev);

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		spin_lock_bh(&vxlan->hash_lock[h]);
		hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) {
			if (f->vni != vni)
				continue;

			list_for_each_entry(rdst, &f->remotes, list)
				rdst->offloaded = false;
		}
		spin_unlock_bh(&vxlan->hash_lock[h]);
	}
}
EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload);

664 665
/* Replace destination of unicast mac */
static int vxlan_fdb_replace(struct vxlan_fdb *f,
666
			     union vxlan_addr *ip, __be16 port, __be32 vni,
667
			     __u32 ifindex, struct vxlan_rdst *oldrd)
668 669 670 671 672 673 674 675 676 677
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

	rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
	if (!rd)
		return 0;
678

679
	*oldrd = *rd;
680
	dst_cache_reset(&rd->dst_cache);
C
Cong Wang 已提交
681
	rd->remote_ip = *ip;
682 683 684
	rd->remote_port = port;
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;
685
	rd->offloaded = false;
686 687 688
	return 1;
}

689 690
/* Add/update destinations for multicast */
static int vxlan_fdb_append(struct vxlan_fdb *f,
691
			    union vxlan_addr *ip, __be16 port, __be32 vni,
692
			    __u32 ifindex, struct vxlan_rdst **rdp)
693 694 695 696 697 698 699
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

700 701 702
	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
	if (rd == NULL)
		return -ENOBUFS;
703 704 705 706 707 708

	if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) {
		kfree(rd);
		return -ENOBUFS;
	}

C
Cong Wang 已提交
709
	rd->remote_ip = *ip;
710
	rd->remote_port = port;
711
	rd->offloaded = false;
712 713
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;
714 715 716

	list_add_tail_rcu(&rd->list, &f->remotes);

717
	*rdp = rd;
718 719 720
	return 1;
}

T
Tom Herbert 已提交
721 722 723
static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
					  unsigned int off,
					  struct vxlanhdr *vh, size_t hdrlen,
724 725
					  __be32 vni_field,
					  struct gro_remcsum *grc,
726
					  bool nopartial)
T
Tom Herbert 已提交
727
{
728
	size_t start, offset;
T
Tom Herbert 已提交
729 730

	if (skb->remcsum_offload)
731
		return vh;
T
Tom Herbert 已提交
732 733 734 735

	if (!NAPI_GRO_CB(skb)->csum_valid)
		return NULL;

736 737
	start = vxlan_rco_start(vni_field);
	offset = start + vxlan_rco_offset(vni_field);
T
Tom Herbert 已提交
738

739 740
	vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen,
				     start, offset, grc, nopartial);
T
Tom Herbert 已提交
741 742 743 744 745 746

	skb->remcsum_offload = 1;

	return vh;
}

747 748 749
static struct sk_buff *vxlan_gro_receive(struct sock *sk,
					 struct list_head *head,
					 struct sk_buff *skb)
750
{
751 752
	struct sk_buff *pp = NULL;
	struct sk_buff *p;
753
	struct vxlanhdr *vh, *vh2;
754
	unsigned int hlen, off_vx;
755
	int flush = 1;
756
	struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk);
757
	__be32 flags;
758 759 760
	struct gro_remcsum grc;

	skb_gro_remcsum_init(&grc);
761 762 763 764 765 766 767 768 769 770

	off_vx = skb_gro_offset(skb);
	hlen = off_vx + sizeof(*vh);
	vh   = skb_gro_header_fast(skb, off_vx);
	if (skb_gro_header_hard(skb, hlen)) {
		vh = skb_gro_header_slow(skb, hlen, off_vx);
		if (unlikely(!vh))
			goto out;
	}

T
Tom Herbert 已提交
771 772
	skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));

773
	flags = vh->vx_flags;
T
Tom Herbert 已提交
774 775 776

	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
		vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr),
777
				       vh->vx_vni, &grc,
778 779
				       !!(vs->flags &
					  VXLAN_F_REMCSUM_NOPARTIAL));
T
Tom Herbert 已提交
780 781 782 783 784

		if (!vh)
			goto out;
	}

785 786
	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */

787
	list_for_each_entry(p, head, list) {
788 789 790 791
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		vh2 = (struct vxlanhdr *)(p->data + off_vx);
T
Thomas Graf 已提交
792 793
		if (vh->vx_flags != vh2->vx_flags ||
		    vh->vx_vni != vh2->vx_vni) {
794 795 796 797 798
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}
	}

S
Sabrina Dubroca 已提交
799
	pp = call_gro_receive(eth_gro_receive, head, skb);
800
	flush = 0;
801 802

out:
803
	skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
804 805 806 807

	return pp;
}

808
static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
809
{
810 811 812
	/* Sets 'skb->inner_mac_header' since we are always called with
	 * 'skb->encapsulation' set.
	 */
813
	return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
814 815
}

816
static struct vxlan_fdb *vxlan_fdb_alloc(const u8 *mac, __u16 state,
P
Petr Machata 已提交
817
					 __be32 src_vni, __u16 ndm_flags)
818 819 820 821 822 823 824 825 826 827
{
	struct vxlan_fdb *f;

	f = kmalloc(sizeof(*f), GFP_ATOMIC);
	if (!f)
		return NULL;
	f->state = state;
	f->flags = ndm_flags;
	f->updated = f->used = jiffies;
	f->vni = src_vni;
828 829
	f->nh = NULL;
	INIT_LIST_HEAD(&f->nh_list);
830 831 832 833 834 835
	INIT_LIST_HEAD(&f->remotes);
	memcpy(f->eth_addr, mac, ETH_ALEN);

	return f;
}

836 837 838 839 840 841 842 843
/* Hash @f into @vxlan's forwarding table under (@mac, @src_vni) and
 * account for it in the device's address count.
 */
static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac,
			     __be32 src_vni, struct vxlan_fdb *f)
{
	struct hlist_head *head;

	++vxlan->addrcnt;
	head = vxlan_fdb_head(vxlan, mac, src_vni);
	hlist_add_head_rcu(&f->hlist, head);
}

844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909
/* Point @fdb at the nexthop object identified by @nhid.
 *
 * The nexthop must exist, still be referenceable, be an fdb nexthop,
 * be a multipath group, and agree with the address family of the
 * device's default destination. On success the reference held on any
 * previous nexthop is dropped and @fdb is linked on the new nexthop's
 * fdb_list.
 *
 * Returns 0 if @fdb already uses @nhid, 1 if the nexthop was changed,
 * -EINVAL or -EAFNOSUPPORT on failure (with a message in @extack).
 */
static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			       u32 nhid, struct netlink_ext_ack *extack)
{
	struct nexthop *old_nh = rtnl_dereference(fdb->nh);
	struct nh_group *nhg;
	struct nexthop *nh;
	int err = -EINVAL;

	if (old_nh && old_nh->id == nhid)
		return 0;

	nh = nexthop_find_by_id(vxlan->net, nhid);
	if (!nh) {
		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
		goto err_inval;
	}

	if (!nexthop_get(nh)) {
		NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
		/* No reference was taken, so there is nothing to put. */
		nh = NULL;
		goto err_inval;
	}

	if (!nh->is_fdb_nh) {
		NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop");
		goto err_inval;
	}

	/* Only look at the group once we know this nexthop is one, and go
	 * through the RCU accessor like the rest of this function does.
	 */
	nhg = nh->is_group ? rtnl_dereference(nh->nh_grp) : NULL;
	if (!nhg || !nhg->mpath) {
		NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group");
		goto err_inval;
	}

	/* check nexthop group family */
	switch (vxlan->default_dst.remote_ip.sa.sa_family) {
	case AF_INET:
		if (!nhg->has_v4) {
			err = -EAFNOSUPPORT;
			NL_SET_ERR_MSG(extack, "Nexthop group family not supported");
			goto err_inval;
		}
		break;
	case AF_INET6:
		if (nhg->has_v4) {
			err = -EAFNOSUPPORT;
			NL_SET_ERR_MSG(extack, "Nexthop group family not supported");
			goto err_inval;
		}
		break;
	default:
		break;
	}

	if (old_nh) {
		list_del_rcu(&fdb->nh_list);
		nexthop_put(old_nh);
	}
	rcu_assign_pointer(fdb->nh, nh);
	list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list);
	return 1;

err_inval:
	if (nh)
		nexthop_put(nh);
	return err;
}

S
stephen hemminger 已提交
910
static int vxlan_fdb_create(struct vxlan_dev *vxlan,
911 912
			    const u8 *mac, union vxlan_addr *ip,
			    __u16 state, __be16 port, __be32 src_vni,
P
Petr Machata 已提交
913
			    __be32 vni, __u32 ifindex, __u16 ndm_flags,
914 915
			    u32 nhid, struct vxlan_fdb **fdb,
			    struct netlink_ext_ack *extack)
916 917 918 919 920 921 922 923 924 925
{
	struct vxlan_rdst *rd = NULL;
	struct vxlan_fdb *f;
	int rc;

	if (vxlan->cfg.addrmax &&
	    vxlan->addrcnt >= vxlan->cfg.addrmax)
		return -ENOSPC;

	netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
926
	f = vxlan_fdb_alloc(mac, state, src_vni, ndm_flags);
927 928 929
	if (!f)
		return -ENOMEM;

930 931 932 933 934 935
	if (nhid)
		rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack);
	else
		rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
	if (rc < 0)
		goto errout;
936 937 938 939

	*fdb = f;

	return 0;
940 941 942 943

errout:
	kfree(f);
	return rc;
944 945
}

946
static void __vxlan_fdb_free(struct vxlan_fdb *f)
947 948
{
	struct vxlan_rdst *rd, *nd;
949 950 951 952 953 954 955 956
	struct nexthop *nh;

	nh = rcu_dereference_raw(f->nh);
	if (nh) {
		rcu_assign_pointer(f->nh, NULL);
		list_del_rcu(&f->nh_list);
		nexthop_put(nh);
	}
957 958 959 960 961 962 963 964

	list_for_each_entry_safe(rd, nd, &f->remotes, list) {
		dst_cache_destroy(&rd->dst_cache);
		kfree(rd);
	}
	kfree(f);
}

965 966 967 968 969 970 971
/* RCU callback: free the forwarding entry embedding @head. */
static void vxlan_fdb_free(struct rcu_head *head)
{
	__vxlan_fdb_free(container_of(head, struct vxlan_fdb, rcu));
}

972 973 974 975 976 977 978 979
/* Remove forwarding entry @f from @vxlan.
 * When @do_notify is set, RTM_DELNEIGH is announced once for a
 * nexthop-backed entry, or once per remote destination otherwise.
 * The entry is then unhashed and freed after an RCU grace period.
 */
static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
			      bool do_notify, bool swdev_notify)
{
	struct vxlan_rdst *rd;

	netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr);

	--vxlan->addrcnt;

	if (do_notify) {
		if (rcu_access_pointer(f->nh)) {
			vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH,
					 swdev_notify, NULL);
		} else {
			list_for_each_entry(rd, &f->remotes, list) {
				vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH,
						 swdev_notify, NULL);
			}
		}
	}

	hlist_del_rcu(&f->hlist);
	call_rcu(&f->rcu, vxlan_fdb_free);
}

994 995 996 997 998 999 1000 1001
/* RCU callback: destroy the dst cache of the remote destination
 * embedding @head and free the remote.
 */
static void vxlan_dst_free(struct rcu_head *head)
{
	struct vxlan_rdst *remote;

	remote = container_of(head, struct vxlan_rdst, rcu);
	dst_cache_destroy(&remote->dst_cache);
	kfree(remote);
}

1002 1003 1004 1005 1006
static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan,
				     union vxlan_addr *ip,
				     __u16 state, __u16 flags,
				     __be16 port, __be32 vni,
				     __u32 ifindex, __u16 ndm_flags,
1007
				     struct vxlan_fdb *f, u32 nhid,
1008 1009
				     bool swdev_notify,
				     struct netlink_ext_ack *extack)
S
stephen hemminger 已提交
1010
{
P
Petr Machata 已提交
1011
	__u16 fdb_flags = (ndm_flags & ~NTF_USE);
1012
	struct vxlan_rdst *rd = NULL;
1013
	struct vxlan_rdst oldrd;
S
stephen hemminger 已提交
1014
	int notify = 0;
1015 1016
	int rc = 0;
	int err;
S
stephen hemminger 已提交
1017

1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029
	if (nhid && !rcu_access_pointer(f->nh)) {
		NL_SET_ERR_MSG(extack,
			       "Cannot replace an existing non nexthop fdb with a nexthop");
		return -EOPNOTSUPP;
	}

	if (nhid && (flags & NLM_F_APPEND)) {
		NL_SET_ERR_MSG(extack,
			       "Cannot append to a nexthop fdb");
		return -EOPNOTSUPP;
	}

1030 1031 1032 1033 1034 1035 1036 1037 1038
	/* Do not allow an externally learned entry to take over an entry added
	 * by the user.
	 */
	if (!(fdb_flags & NTF_EXT_LEARNED) ||
	    !(f->flags & NTF_VXLAN_ADDED_BY_USER)) {
		if (f->state != state) {
			f->state = state;
			f->updated = jiffies;
			notify = 1;
1039
		}
1040 1041 1042 1043
		if (f->flags != fdb_flags) {
			f->flags = fdb_flags;
			f->updated = jiffies;
			notify = 1;
1044
		}
1045
	}
1046

1047 1048 1049 1050
	if ((flags & NLM_F_REPLACE)) {
		/* Only change unicasts */
		if (!(is_multicast_ether_addr(f->eth_addr) ||
		      is_zero_ether_addr(f->eth_addr))) {
1051 1052 1053 1054 1055 1056 1057 1058
			if (nhid) {
				rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack);
				if (rc < 0)
					return rc;
			} else {
				rc = vxlan_fdb_replace(f, ip, port, vni,
						       ifindex, &oldrd);
			}
1059
			notify |= rc;
1060
		} else {
1061
			NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries");
1062
			return -EOPNOTSUPP;
1063 1064 1065 1066 1067 1068
		}
	}
	if ((flags & NLM_F_APPEND) &&
	    (is_multicast_ether_addr(f->eth_addr) ||
	     is_zero_ether_addr(f->eth_addr))) {
		rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
1069

1070
		if (rc < 0)
1071
			return rc;
1072
		notify |= rc;
S
stephen hemminger 已提交
1073 1074
	}

1075 1076 1077
	if (ndm_flags & NTF_USE)
		f->used = jiffies;

1078 1079 1080
	if (notify) {
		if (rd == NULL)
			rd = first_remote_rtnl(f);
1081

1082
		err = vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH,
1083
				       swdev_notify, extack);
1084 1085
		if (err)
			goto err_notify;
1086
	}
S
stephen hemminger 已提交
1087 1088

	return 0;
1089 1090

err_notify:
1091 1092
	if (nhid)
		return err;
1093 1094
	if ((flags & NLM_F_REPLACE) && rc)
		*rd = oldrd;
1095
	else if ((flags & NLM_F_APPEND) && rc) {
1096
		list_del_rcu(&rd->list);
1097 1098
		call_rcu(&rd->rcu, vxlan_dst_free);
	}
1099
	return err;
S
stephen hemminger 已提交
1100 1101
}

1102 1103 1104 1105
static int vxlan_fdb_update_create(struct vxlan_dev *vxlan,
				   const u8 *mac, union vxlan_addr *ip,
				   __u16 state, __u16 flags,
				   __be16 port, __be32 src_vni, __be32 vni,
1106
				   __u32 ifindex, __u16 ndm_flags, u32 nhid,
1107 1108
				   bool swdev_notify,
				   struct netlink_ext_ack *extack)
1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120
{
	__u16 fdb_flags = (ndm_flags & ~NTF_USE);
	struct vxlan_fdb *f;
	int rc;

	/* Disallow replace to add a multicast entry */
	if ((flags & NLM_F_REPLACE) &&
	    (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
		return -EOPNOTSUPP;

	netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
	rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni,
1121
			      vni, ifindex, fdb_flags, nhid, &f, extack);
1122 1123 1124
	if (rc < 0)
		return rc;

1125
	vxlan_fdb_insert(vxlan, mac, src_vni, f);
1126
	rc = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH,
1127
			      swdev_notify, extack);
1128 1129 1130
	if (rc)
		goto err_notify;

1131
	return 0;
1132 1133 1134 1135

err_notify:
	vxlan_fdb_destroy(vxlan, f, false, false);
	return rc;
1136 1137 1138 1139 1140 1141 1142
}

/* Add new entry to forwarding table -- assumes lock held */
static int vxlan_fdb_update(struct vxlan_dev *vxlan,
			    const u8 *mac, union vxlan_addr *ip,
			    __u16 state, __u16 flags,
			    __be16 port, __be32 src_vni, __be32 vni,
1143
			    __u32 ifindex, __u16 ndm_flags, u32 nhid,
1144 1145
			    bool swdev_notify,
			    struct netlink_ext_ack *extack)
1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158
{
	struct vxlan_fdb *f;

	f = __vxlan_find_mac(vxlan, mac, src_vni);
	if (f) {
		if (flags & NLM_F_EXCL) {
			netdev_dbg(vxlan->dev,
				   "lost race to create %pM\n", mac);
			return -EEXIST;
		}

		return vxlan_fdb_update_existing(vxlan, ip, state, flags, port,
						 vni, ifindex, ndm_flags, f,
1159
						 nhid, swdev_notify, extack);
1160 1161 1162 1163 1164 1165
	} else {
		if (!(flags & NLM_F_CREATE))
			return -ENOENT;

		return vxlan_fdb_update_create(vxlan, mac, ip, state, flags,
					       port, src_vni, vni, ifindex,
1166 1167
					       ndm_flags, nhid, swdev_notify,
					       extack);
1168 1169 1170
	}
}

1171
/* Remove a single remote from an fdb entry: unlink it, send RTM_DELNEIGH
 * for it, and queue it for freeing via call_rcu().
 */
static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
				  struct vxlan_rdst *rd, bool swdev_notify)
{
	list_del_rcu(&rd->list);

	vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify, NULL);

	call_rcu(&rd->rcu, vxlan_dst_free);
}

M
Mike Rapoport 已提交
1179
/* Parse the remote-destination attributes of an fdb netlink request.
 *
 * Absent attributes fall back to defaults: NDA_DST to the "any" address
 * of the device's default remote family, NDA_PORT to vxlan->cfg.dst_port,
 * NDA_VNI and NDA_SRC_VNI to the default remote VNI, NDA_IFINDEX and
 * NDA_NH_ID to 0.
 *
 * Returns 0 on success, -EINVAL when a fixed-size attribute has the wrong
 * length, -EADDRNOTAVAIL when NDA_IFINDEX names no device in the vxlan
 * device's netns, or the error returned by vxlan_nla_get_addr().
 */
static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
			   union vxlan_addr *ip, __be16 *port, __be32 *src_vni,
			   __be32 *vni, u32 *ifindex, u32 *nhid)
{
	struct net *net = dev_net(vxlan->dev);
	int err;

	if (tb[NDA_DST]) {
		err = vxlan_nla_get_addr(ip, tb[NDA_DST]);
		if (err)
			return err;
	} else {
		union vxlan_addr *remote = &vxlan->default_dst.remote_ip;

		if (remote->sa.sa_family == AF_INET) {
			ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
			ip->sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			ip->sin6.sin6_addr = in6addr_any;
			ip->sa.sa_family = AF_INET6;
#endif
		}
	}

	if (tb[NDA_PORT]) {
		if (nla_len(tb[NDA_PORT]) != sizeof(__be16))
			return -EINVAL;
		*port = nla_get_be16(tb[NDA_PORT]);
	} else {
		*port = vxlan->cfg.dst_port;
	}

	if (tb[NDA_VNI]) {
		if (nla_len(tb[NDA_VNI]) != sizeof(u32))
			return -EINVAL;
		*vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
	} else {
		*vni = vxlan->default_dst.remote_vni;
	}

	if (tb[NDA_SRC_VNI]) {
		if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32))
			return -EINVAL;
		*src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI]));
	} else {
		*src_vni = vxlan->default_dst.remote_vni;
	}

	if (tb[NDA_IFINDEX]) {
		if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
			return -EINVAL;
		*ifindex = nla_get_u32(tb[NDA_IFINDEX]);

		/* The index must name an existing device in our netns */
		if (!__dev_get_by_index(net, *ifindex))
			return -EADDRNOTAVAIL;
	} else {
		*ifindex = 0;
	}

	if (tb[NDA_NH_ID]) {
		/* Validate the length like every other fixed-size attribute
		 * above, so a short attribute is never read as a u32.
		 */
		if (nla_len(tb[NDA_NH_ID]) != sizeof(u32))
			return -EINVAL;
		*nhid = nla_get_u32(tb[NDA_NH_ID]);
	} else {
		*nhid = 0;
	}

	return 0;
}

/* Add static entry (via netlink)
 *
 * Requires a PERMANENT or REACHABLE state and either NDA_DST or NDA_NH_ID.
 * The destination's address family must match the device's default remote.
 * The update itself runs under the hash lock of the bucket for @addr.
 */
static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr, u16 vid, u16 flags,
			 struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	__be32 src_vni, vni;
	union vxlan_addr ip;
	u32 ifindex, nhid;
	u32 hash_index;
	__be16 port;
	int err;

	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
		pr_info("RTM_NEWNEIGH with invalid state %#x\n",
			ndm->ndm_state);
		return -EINVAL;
	}

	if (!tb)
		return -EINVAL;
	if (!tb[NDA_DST] && !tb[NDA_NH_ID])
		return -EINVAL;

	err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
			      &nhid);
	if (err)
		return err;

	if (ip.sa.sa_family != vxlan->default_dst.remote_ip.sa.sa_family)
		return -EAFNOSUPPORT;

	hash_index = fdb_head_index(vxlan, addr, src_vni);

	spin_lock_bh(&vxlan->hash_lock[hash_index]);
	err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags,
			       port, src_vni, vni, ifindex,
			       ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER,
			       nhid, true, extack);
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);

	return err;
}

1292 1293
static int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
			      const unsigned char *addr, union vxlan_addr ip,
1294
			      __be16 port, __be32 src_vni, __be32 vni,
1295
			      u32 ifindex, bool swdev_notify)
S
stephen hemminger 已提交
1296
{
1297
	struct vxlan_rdst *rd = NULL;
1298
	struct vxlan_fdb *f;
1299
	int err = -ENOENT;
1300

1301
	f = vxlan_find_mac(vxlan, addr, src_vni);
1302
	if (!f)
1303
		return err;
1304

C
Cong Wang 已提交
1305 1306
	if (!vxlan_addr_any(&ip)) {
		rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex);
1307 1308 1309 1310 1311 1312 1313 1314
		if (!rd)
			goto out;
	}

	/* remove a destination if it's not the only one on the list,
	 * otherwise destroy the fdb entry
	 */
	if (rd && !list_is_singular(&f->remotes)) {
1315
		vxlan_fdb_dst_destroy(vxlan, f, rd, swdev_notify);
1316
		goto out;
S
stephen hemminger 已提交
1317
	}
1318

1319
	vxlan_fdb_destroy(vxlan, f, true, swdev_notify);
1320 1321

out:
1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332
	return 0;
}

/* Delete entry (via netlink)
 *
 * Parses the request attributes, then performs the delete under the hash
 * lock of the bucket for @addr. The parsed nexthop id is not used here.
 */
static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
			    struct net_device *dev,
			    const unsigned char *addr, u16 vid)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	__be32 src_vni, vni;
	union vxlan_addr ip;
	u32 ifindex, nhid;
	u32 hash_index;
	__be16 port;
	int err;

	err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
			      &nhid);
	if (err)
		return err;

	hash_index = fdb_head_index(vxlan, addr, src_vni);

	spin_lock_bh(&vxlan->hash_lock[hash_index]);
	err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex,
				 true);
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);

	return err;
}

/* Dump forwarding table */
static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
1354
			  struct net_device *dev,
1355
			  struct net_device *filter_dev, int *idx)
S
stephen hemminger 已提交
1356 1357 1358
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	unsigned int h;
1359
	int err = 0;
S
stephen hemminger 已提交
1360 1361 1362 1363

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct vxlan_fdb *f;

1364
		hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
1365 1366
			struct vxlan_rdst *rd;

1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377
			if (rcu_access_pointer(f->nh)) {
				err = vxlan_fdb_info(skb, vxlan, f,
						     NETLINK_CB(cb->skb).portid,
						     cb->nlh->nlmsg_seq,
						     RTM_NEWNEIGH,
						     NLM_F_MULTI, NULL);
				if (err < 0)
					goto out;
				continue;
			}

1378
			list_for_each_entry_rcu(rd, &f->remotes, list) {
1379
				if (*idx < cb->args[2])
1380 1381
					goto skip;

1382 1383 1384 1385 1386
				err = vxlan_fdb_info(skb, vxlan, f,
						     NETLINK_CB(cb->skb).portid,
						     cb->nlh->nlmsg_seq,
						     RTM_NEWNEIGH,
						     NLM_F_MULTI, rd);
1387
				if (err < 0)
1388 1389
					goto out;
skip:
1390
				*idx += 1;
1391
			}
S
stephen hemminger 已提交
1392 1393
		}
	}
1394
out:
1395
	return err;
S
stephen hemminger 已提交
1396 1397
}

R
Roopa Prabhu 已提交
1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430
/* Look up a single fdb entry and fill @skb with an RTM_NEWNEIGH message
 * describing it and its first remote. The VNI comes from NDA_VNI when
 * present, otherwise from the device's default remote VNI.
 *
 * Returns the result of vxlan_fdb_info(), or -ENOENT if no entry matches.
 */
static int vxlan_fdb_get(struct sk_buff *skb,
			 struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr,
			 u16 vid, u32 portid, u32 seq,
			 struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
	__be32 vni;
	int err;

	vni = tb[NDA_VNI] ? cpu_to_be32(nla_get_u32(tb[NDA_VNI])) :
			    vxlan->default_dst.remote_vni;

	rcu_read_lock();

	f = __vxlan_find_mac(vxlan, addr, vni);
	if (f) {
		err = vxlan_fdb_info(skb, vxlan, f, portid, seq,
				     RTM_NEWNEIGH, 0, first_remote_rcu(f));
	} else {
		NL_SET_ERR_MSG(extack, "Fdb entry not found");
		err = -ENOENT;
	}

	rcu_read_unlock();
	return err;
}

S
stephen hemminger 已提交
1431 1432
/* Watch incoming packets to learn mapping between Ethernet address
 * and Tunnel endpoint.
1433
 * Return true if packet is bogus and should be dropped.
S
stephen hemminger 已提交
1434
 */
1435
static bool vxlan_snoop(struct net_device *dev,
1436
			union vxlan_addr *src_ip, const u8 *src_mac,
1437
			u32 src_ifindex, __be32 vni)
S
stephen hemminger 已提交
1438 1439 1440
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
1441 1442 1443 1444 1445 1446 1447
	u32 ifindex = 0;

#if IS_ENABLED(CONFIG_IPV6)
	if (src_ip->sa.sa_family == AF_INET6 &&
	    (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL))
		ifindex = src_ifindex;
#endif
S
stephen hemminger 已提交
1448

1449
	f = vxlan_find_mac(vxlan, src_mac, vni);
S
stephen hemminger 已提交
1450
	if (likely(f)) {
1451
		struct vxlan_rdst *rdst = first_remote_rcu(f);
1452

1453 1454
		if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) &&
			   rdst->remote_ifindex == ifindex))
1455 1456 1457
			return false;

		/* Don't migrate static entries, drop packets */
1458
		if (f->state & (NUD_PERMANENT | NUD_NOARP))
1459
			return true;
S
stephen hemminger 已提交
1460

1461 1462 1463 1464
		/* Don't override an fdb with nexthop with a learnt entry */
		if (rcu_access_pointer(f->nh))
			return true;

S
stephen hemminger 已提交
1465 1466
		if (net_ratelimit())
			netdev_info(dev,
C
Cong Wang 已提交
1467
				    "%pM migrated from %pIS to %pIS\n",
1468
				    src_mac, &rdst->remote_ip.sa, &src_ip->sa);
S
stephen hemminger 已提交
1469

C
Cong Wang 已提交
1470
		rdst->remote_ip = *src_ip;
S
stephen hemminger 已提交
1471
		f->updated = jiffies;
1472
		vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL);
S
stephen hemminger 已提交
1473
	} else {
1474 1475
		u32 hash_index = fdb_head_index(vxlan, src_mac, vni);

S
stephen hemminger 已提交
1476
		/* learned new entry */
1477
		spin_lock(&vxlan->hash_lock[hash_index]);
1478 1479 1480

		/* close off race between vxlan_flush and incoming packets */
		if (netif_running(dev))
1481
			vxlan_fdb_update(vxlan, src_mac, src_ip,
1482 1483
					 NUD_REACHABLE,
					 NLM_F_EXCL|NLM_F_CREATE,
1484
					 vxlan->cfg.dst_port,
1485
					 vni,
1486
					 vxlan->default_dst.remote_vni,
1487
					 ifindex, NTF_SELF, 0, true, NULL);
1488
		spin_unlock(&vxlan->hash_lock[hash_index]);
S
stephen hemminger 已提交
1489
	}
1490 1491

	return false;
S
stephen hemminger 已提交
1492 1493 1494
}

/* See if multicast group is already in use by other ID */
1495
static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
S
stephen hemminger 已提交
1496
{
1497
	struct vxlan_dev *vxlan;
1498
	struct vxlan_sock *sock4;
A
Arnd Bergmann 已提交
1499 1500 1501
#if IS_ENABLED(CONFIG_IPV6)
	struct vxlan_sock *sock6;
#endif
1502
	unsigned short family = dev->default_dst.remote_ip.sa.sa_family;
S
stephen hemminger 已提交
1503

1504 1505
	sock4 = rtnl_dereference(dev->vn4_sock);

1506 1507 1508
	/* The vxlan_sock is only used by dev, leaving group has
	 * no effect on other vxlan devices.
	 */
1509
	if (family == AF_INET && sock4 && refcount_read(&sock4->refcnt) == 1)
1510
		return false;
1511
#if IS_ENABLED(CONFIG_IPV6)
1512
	sock6 = rtnl_dereference(dev->vn6_sock);
1513
	if (family == AF_INET6 && sock6 && refcount_read(&sock6->refcnt) == 1)
1514 1515
		return false;
#endif
1516

1517
	list_for_each_entry(vxlan, &vn->vxlan_list, next) {
1518
		if (!netif_running(vxlan->dev) || vxlan == dev)
1519
			continue;
S
stephen hemminger 已提交
1520

1521 1522
		if (family == AF_INET &&
		    rtnl_dereference(vxlan->vn4_sock) != sock4)
1523
			continue;
1524
#if IS_ENABLED(CONFIG_IPV6)
1525 1526
		if (family == AF_INET6 &&
		    rtnl_dereference(vxlan->vn6_sock) != sock6)
1527 1528
			continue;
#endif
1529 1530 1531 1532 1533 1534 1535 1536 1537 1538

		if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip,
				      &dev->default_dst.remote_ip))
			continue;

		if (vxlan->default_dst.remote_ifindex !=
		    dev->default_dst.remote_ifindex)
			continue;

		return true;
1539
	}
S
stephen hemminger 已提交
1540 1541 1542 1543

	return false;
}

1544
static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
1545
{
1546
	struct vxlan_net *vn;
1547

1548
	if (!vs)
1549
		return false;
1550
	if (!refcount_dec_and_test(&vs->refcnt))
1551
		return false;
S
stephen hemminger 已提交
1552

1553
	vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id);
1554
	spin_lock(&vn->sock_lock);
1555
	hlist_del_rcu(&vs->hlist);
1556
	udp_tunnel_notify_del_rx_port(vs->sock,
1557 1558
				      (vs->flags & VXLAN_F_GPE) ?
				      UDP_TUNNEL_TYPE_VXLAN_GPE :
1559
				      UDP_TUNNEL_TYPE_VXLAN);
1560 1561
	spin_unlock(&vn->sock_lock);

1562
	return true;
S
stephen hemminger 已提交
1563 1564
}

1565 1566
static void vxlan_sock_release(struct vxlan_dev *vxlan)
{
1567
	struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
1568
#if IS_ENABLED(CONFIG_IPV6)
1569 1570
	struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

1571
	RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
1572 1573
#endif

1574
	RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
1575 1576
	synchronize_net();

1577 1578
	vxlan_vs_del_dev(vxlan);

1579 1580 1581
	if (__vxlan_sock_release_prep(sock4)) {
		udp_tunnel_sock_release(sock4->sock);
		kfree(sock4);
1582 1583 1584
	}

#if IS_ENABLED(CONFIG_IPV6)
1585 1586 1587
	if (__vxlan_sock_release_prep(sock6)) {
		udp_tunnel_sock_release(sock6->sock);
		kfree(sock6);
1588
	}
1589 1590 1591
#endif
}

1592
/* Update multicast group membership when first VNI on
1593
 * multicast address is brought up
1594
 */
1595
static int vxlan_igmp_join(struct vxlan_dev *vxlan)
S
stephen hemminger 已提交
1596
{
1597
	struct sock *sk;
C
Cong Wang 已提交
1598 1599
	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
	int ifindex = vxlan->default_dst.remote_ifindex;
1600
	int ret = -EINVAL;
S
stephen hemminger 已提交
1601

C
Cong Wang 已提交
1602
	if (ip->sa.sa_family == AF_INET) {
1603
		struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
C
Cong Wang 已提交
1604 1605 1606 1607 1608
		struct ip_mreqn mreq = {
			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
			.imr_ifindex		= ifindex,
		};

1609
		sk = sock4->sock->sk;
1610
		lock_sock(sk);
1611
		ret = ip_mc_join_group(sk, &mreq);
1612
		release_sock(sk);
C
Cong Wang 已提交
1613 1614
#if IS_ENABLED(CONFIG_IPV6)
	} else {
1615 1616 1617
		struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

		sk = sock6->sock->sk;
1618
		lock_sock(sk);
1619 1620
		ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
						   &ip->sin6.sin6_addr);
1621
		release_sock(sk);
C
Cong Wang 已提交
1622 1623
#endif
	}
S
stephen hemminger 已提交
1624

1625
	return ret;
S
stephen hemminger 已提交
1626 1627 1628
}

/* Inverse of vxlan_igmp_join when last VNI is brought down */
1629
static int vxlan_igmp_leave(struct vxlan_dev *vxlan)
S
stephen hemminger 已提交
1630
{
1631
	struct sock *sk;
C
Cong Wang 已提交
1632 1633
	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
	int ifindex = vxlan->default_dst.remote_ifindex;
1634
	int ret = -EINVAL;
S
stephen hemminger 已提交
1635

C
Cong Wang 已提交
1636
	if (ip->sa.sa_family == AF_INET) {
1637
		struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
C
Cong Wang 已提交
1638 1639 1640 1641 1642
		struct ip_mreqn mreq = {
			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
			.imr_ifindex		= ifindex,
		};

1643
		sk = sock4->sock->sk;
1644
		lock_sock(sk);
1645
		ret = ip_mc_leave_group(sk, &mreq);
1646
		release_sock(sk);
C
Cong Wang 已提交
1647 1648
#if IS_ENABLED(CONFIG_IPV6)
	} else {
1649 1650 1651
		struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

		sk = sock6->sock->sk;
1652
		lock_sock(sk);
1653 1654
		ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
						   &ip->sin6.sin6_addr);
1655
		release_sock(sk);
C
Cong Wang 已提交
1656 1657
#endif
	}
S
stephen hemminger 已提交
1658

1659
	return ret;
S
stephen hemminger 已提交
1660 1661
}

1662 1663
/* Handle the remote checksum offload (RCO) part of a VXLAN header.
 *
 * If the RCO flag is set and the skb has not already been marked
 * remcsum_offload, the checksum start/offset are decoded from the VNI
 * field and applied to the skb. In all non-failing cases the RCO flag and
 * the non-VNI bits of vx_vni are cleared from @unparsed.
 *
 * Returns false only if the skb is too short to contain the checksum
 * field; @unparsed is left untouched in that case.
 */
static bool vxlan_remcsum(struct vxlanhdr *unparsed,
			  struct sk_buff *skb, u32 vxflags)
{
	size_t start, offset;

	if ((unparsed->vx_flags & VXLAN_HF_RCO) && !skb->remcsum_offload) {
		start = vxlan_rco_start(unparsed->vx_vni);
		offset = start + vxlan_rco_offset(unparsed->vx_vni);

		if (!pskb_may_pull(skb, offset + sizeof(u16)))
			return false;

		skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1),
				    start, offset,
				    !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL));
	}

	unparsed->vx_flags &= ~VXLAN_HF_RCO;
	unparsed->vx_vni &= VXLAN_VNI_MASK;
	return true;
}

1684
/* Decode the Group Based Policy (GBP) extension of a VXLAN header into
 * @md, and clear the GBP bits from @unparsed whether or not the GBP flag
 * was set.
 */
static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
				struct sk_buff *skb, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed;
	struct metadata_dst *tun_dst;

	if (unparsed->vx_flags & VXLAN_HF_GBP) {
		md->gbp = ntohs(gbp->policy_id);

		/* Flag the option on the metadata dst, if one is attached */
		tun_dst = (struct metadata_dst *)skb_dst(skb);
		if (tun_dst) {
			tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
			tun_dst->u.tun_info.options_len = sizeof(*md);
		}

		if (gbp->dont_learn)
			md->gbp |= VXLAN_GBP_DONT_LEARN;

		if (gbp->policy_applied)
			md->gbp |= VXLAN_GBP_POLICY_APPLIED;

		/* In flow-based mode, GBP is carried in dst_metadata */
		if (!(vxflags & VXLAN_F_COLLECT_METADATA))
			skb->mark = md->gbp;
	}

	unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS;
}

J
Jiri Benc 已提交
1714
static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed,
J
Jiri Benc 已提交
1715
				__be16 *protocol,
J
Jiri Benc 已提交
1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734
				struct sk_buff *skb, u32 vxflags)
{
	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed;

	/* Need to have Next Protocol set for interfaces in GPE mode. */
	if (!gpe->np_applied)
		return false;
	/* "The initial version is 0. If a receiver does not support the
	 * version indicated it MUST drop the packet.
	 */
	if (gpe->version != 0)
		return false;
	/* "When the O bit is set to 1, the packet is an OAM packet and OAM
	 * processing MUST occur." However, we don't implement OAM
	 * processing, thus drop the packet.
	 */
	if (gpe->oam_flag)
		return false;

1735 1736
	*protocol = tun_p_to_eth_p(gpe->next_protocol);
	if (!*protocol)
J
Jiri Benc 已提交
1737 1738 1739 1740 1741 1742
		return false;

	unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS;
	return true;
}

1743 1744
/* Set up the inner Ethernet header of a decapsulated frame and, when
 * learning is enabled, snoop its source MAC against the outer source IP.
 *
 * Returns false if the frame should be dropped: either its source MAC is
 * the vxlan device's own address, or vxlan_snoop() rejected it.
 */
static bool vxlan_set_mac(struct vxlan_dev *vxlan,
			  struct vxlan_sock *vs,
			  struct sk_buff *skb, __be32 vni)
{
	/* Read the ifindex before eth_type_trans() is handed vxlan->dev */
	u32 ifindex = skb->dev->ifindex;
	union vxlan_addr saddr;

	skb_reset_mac_header(skb);
	skb->protocol = eth_type_trans(skb, vxlan->dev);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	/* Ignore packet loops (and multicast echo) */
	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
		return false;

	/* Get address from the outer IP header */
	if (vxlan_get_sk_family(vs) == AF_INET) {
		saddr.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
		saddr.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		saddr.sin6.sin6_addr = ipv6_hdr(skb)->saddr;
		saddr.sa.sa_family = AF_INET6;
#endif
	}

	if (!(vxlan->cfg.flags & VXLAN_F_LEARN))
		return true;

	return !vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source,
			    ifindex, vni);
}

1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799
/* Run ECN decapsulation for the outer header @oiph according to the
 * socket's address family, logging (rate-limited) a non-zero result when
 * log_ecn_error is set.
 *
 * Returns true when the decapsulation result is <= 1, false otherwise.
 */
static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
				  struct sk_buff *skb)
{
	bool outer_is_v4 = vxlan_get_sk_family(vs) == AF_INET;
	int err = 0;

	if (outer_is_v4)
		err = IP_ECN_decapsulate(oiph, skb);
#if IS_ENABLED(CONFIG_IPV6)
	else
		err = IP6_ECN_decapsulate(oiph, skb);
#endif

	if (unlikely(err) && log_ecn_error) {
		if (outer_is_v4) {
			struct iphdr *iph = oiph;

			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					     &iph->saddr, iph->tos);
		} else {
			struct ipv6hdr *ip6h = oiph;

			net_info_ratelimited("non-ECT from %pI6\n",
					     &ip6h->saddr);
		}
	}

	return err <= 1;
}

S
stephen hemminger 已提交
1800
/* Callback from net/ipv4/udp.c to receive packets */
1801
static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
S
stephen hemminger 已提交
1802
{
1803
	struct pcpu_sw_netstats *stats;
1804
	struct vxlan_dev *vxlan;
P
Pravin B Shelar 已提交
1805
	struct vxlan_sock *vs;
1806
	struct vxlanhdr unparsed;
T
Thomas Graf 已提交
1807 1808
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
J
Jiri Benc 已提交
1809
	__be16 protocol = htons(ETH_P_TEB);
J
Jiri Benc 已提交
1810
	bool raw_proto = false;
1811
	void *oiph;
1812
	__be32 vni = 0;
S
stephen hemminger 已提交
1813

J
Jiri Benc 已提交
1814
	/* Need UDP and VXLAN header to be present */
1815
	if (!pskb_may_pull(skb, VXLAN_HLEN))
1816
		goto drop;
S
stephen hemminger 已提交
1817

1818
	unparsed = *vxlan_hdr(skb);
J
Jiri Benc 已提交
1819 1820 1821 1822 1823 1824
	/* VNI flag always required to be set */
	if (!(unparsed.vx_flags & VXLAN_HF_VNI)) {
		netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
			   ntohl(vxlan_hdr(skb)->vx_flags),
			   ntohl(vxlan_hdr(skb)->vx_vni));
		/* Return non vxlan pkt */
1825
		goto drop;
S
stephen hemminger 已提交
1826
	}
J
Jiri Benc 已提交
1827 1828
	unparsed.vx_flags &= ~VXLAN_HF_VNI;
	unparsed.vx_vni &= ~VXLAN_VNI_MASK;
S
stephen hemminger 已提交
1829

1830
	vs = rcu_dereference_sk_user_data(sk);
P
Pravin B Shelar 已提交
1831
	if (!vs)
S
stephen hemminger 已提交
1832 1833
		goto drop;

1834 1835
	vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);

1836
	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
1837 1838 1839
	if (!vxlan)
		goto drop;

J
Jiri Benc 已提交
1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850
	/* For backwards compatibility, only allow reserved fields to be
	 * used by VXLAN extensions if explicitly requested.
	 */
	if (vs->flags & VXLAN_F_GPE) {
		if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
			goto drop;
		raw_proto = true;
	}

	if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto,
				   !net_eq(vxlan->net, dev_net(vxlan->dev))))
1851
		goto drop;
1852

T
Thomas Graf 已提交
1853
	if (vxlan_collect_metadata(vs)) {
1854
		struct metadata_dst *tun_dst;
J
Jiri Benc 已提交
1855

1856
		tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY,
1857
					 key32_to_tunnel_id(vni), sizeof(*md));
1858

T
Thomas Graf 已提交
1859 1860 1861
		if (!tun_dst)
			goto drop;

1862
		md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
1863 1864

		skb_dst_set(skb, (struct dst_entry *)tun_dst);
T
Thomas Graf 已提交
1865 1866 1867 1868
	} else {
		memset(md, 0, sizeof(*md));
	}

1869 1870 1871 1872
	if (vs->flags & VXLAN_F_REMCSUM_RX)
		if (!vxlan_remcsum(&unparsed, skb, vs->flags))
			goto drop;
	if (vs->flags & VXLAN_F_GBP)
1873
		vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md);
J
Jiri Benc 已提交
1874 1875 1876
	/* Note that GBP and GPE can never be active together. This is
	 * ensured in vxlan_dev_configure.
	 */
T
Thomas Graf 已提交
1877

1878
	if (unparsed.vx_flags || unparsed.vx_vni) {
1879 1880 1881 1882
		/* If there are any unprocessed flags remaining treat
		 * this as a malformed packet. This behavior diverges from
		 * VXLAN RFC (RFC7348) which stipulates that bits in reserved
		 * in reserved fields are to be ignored. The approach here
1883
		 * maintains compatibility with previous stack code, and also
1884 1885 1886
		 * is more robust and provides a little more security in
		 * adding extensions to VXLAN.
		 */
J
Jiri Benc 已提交
1887
		goto drop;
1888 1889
	}

J
Jiri Benc 已提交
1890
	if (!raw_proto) {
1891
		if (!vxlan_set_mac(vxlan, vs, skb, vni))
J
Jiri Benc 已提交
1892 1893
			goto drop;
	} else {
1894
		skb_reset_mac_header(skb);
J
Jiri Benc 已提交
1895 1896 1897
		skb->dev = vxlan->dev;
		skb->pkt_type = PACKET_HOST;
	}
1898 1899 1900 1901 1902 1903 1904 1905 1906 1907

	oiph = skb_network_header(skb);
	skb_reset_network_header(skb);

	if (!vxlan_ecn_decapsulate(vs, oiph, skb)) {
		++vxlan->dev->stats.rx_frame_errors;
		++vxlan->dev->stats.rx_errors;
		goto drop;
	}

1908 1909 1910 1911 1912 1913 1914 1915
	rcu_read_lock();

	if (unlikely(!(vxlan->dev->flags & IFF_UP))) {
		rcu_read_unlock();
		atomic_long_inc(&vxlan->dev->rx_dropped);
		goto drop;
	}

1916 1917 1918 1919 1920 1921 1922
	stats = this_cpu_ptr(vxlan->dev->tstats);
	u64_stats_update_begin(&stats->syncp);
	stats->rx_packets++;
	stats->rx_bytes += skb->len;
	u64_stats_update_end(&stats->syncp);

	gro_cells_receive(&vxlan->gro_cells, skb);
1923 1924 1925

	rcu_read_unlock();

P
Pravin B Shelar 已提交
1926 1927 1928
	return 0;

drop:
J
Jiri Benc 已提交
1929 1930 1931
	/* Consume bad packet */
	kfree_skb(skb);
	return 0;
P
Pravin B Shelar 已提交
1932 1933
}

S
Stefano Brivio 已提交
1934 1935 1936 1937 1938 1939 1940 1941
/* Callback from net/ipv{4,6}/udp.c to check that we have a VNI for errors */
static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan;
	struct vxlan_sock *vs;
	struct vxlanhdr *hdr;
	__be32 vni;

1942
	if (!pskb_may_pull(skb, skb_transport_offset(skb) + VXLAN_HLEN))
S
Stefano Brivio 已提交
1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961
		return -EINVAL;

	hdr = vxlan_hdr(skb);

	if (!(hdr->vx_flags & VXLAN_HF_VNI))
		return -EINVAL;

	vs = rcu_dereference_sk_user_data(sk);
	if (!vs)
		return -ENOENT;

	vni = vxlan_vni(hdr->vx_vni);
	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
	if (!vxlan)
		return -ENOENT;

	return 0;
}

1962
/* Handle an ARP frame locally instead of sending it over the tunnel.
 *
 * For an IPv4-over-Ethernet ARP request whose target has a connected entry
 * in the ARP table, an ARP reply is built and injected into the receive
 * path.  When no neighbour entry exists and VXLAN_F_L3MISS is set, an L3
 * miss notification is raised instead.  The request skb is always consumed
 * and NETDEV_TX_OK is always returned.
 */
static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct neighbour *neigh;
	struct sk_buff *reply;
	struct vxlan_fdb *fdb;
	struct arphdr *arp;
	__be32 sip, tip;
	u8 *cursor, *sha;

	if (dev->flags & IFF_NOARP)
		goto out;

	if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
		dev->stats.tx_dropped++;
		goto out;
	}
	arp = arp_hdr(skb);

	/* Only IPv4 requests on Ethernet/802 hardware are answered */
	if (arp->ar_hrd != htons(ARPHRD_ETHER) &&
	    arp->ar_hrd != htons(ARPHRD_IEEE802))
		goto out;
	if (arp->ar_pro != htons(ETH_P_IP) ||
	    arp->ar_op != htons(ARPOP_REQUEST))
		goto out;
	if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4)
		goto out;

	/* Payload after the fixed header: sha | sip | tha | tip */
	sha = (u8 *)(arp + 1);
	cursor = sha + dev->addr_len;
	memcpy(&sip, cursor, sizeof(sip));
	cursor += sizeof(sip) + dev->addr_len;
	memcpy(&tip, cursor, sizeof(tip));

	if (ipv4_is_loopback(tip) || ipv4_is_multicast(tip))
		goto out;

	neigh = neigh_lookup(&arp_tbl, &tip, dev);
	if (!neigh) {
		if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
			union vxlan_addr ipa = {
				.sin.sin_addr.s_addr = tip,
				.sin.sin_family = AF_INET,
			};

			vxlan_ip_miss(dev, &ipa);
		}
		goto out;
	}

	if (!(neigh->nud_state & NUD_CONNECTED)) {
		neigh_release(neigh);
		goto out;
	}

	fdb = vxlan_find_mac(vxlan, neigh->ha, vni);
	if (fdb && vxlan_addr_any(&(first_remote_rcu(fdb)->remote_ip))) {
		/* bridge-local neighbor */
		neigh_release(neigh);
		goto out;
	}

	reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
			   neigh->ha, sha);
	neigh_release(neigh);
	if (!reply)
		goto out;

	skb_reset_mac_header(reply);
	__skb_pull(reply, skb_network_offset(reply));
	reply->ip_summed = CHECKSUM_UNNECESSARY;
	reply->pkt_type = PACKET_HOST;

	if (netif_rx_ni(reply) == NET_RX_DROP)
		dev->stats.rx_dropped++;

out:
	consume_skb(skb);
	return NETDEV_TX_OK;
}

C
Cong Wang 已提交
2044
#if IS_ENABLED(CONFIG_IPV6)
2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056
/* Build a Neighbour Advertisement answering the Neighbour Solicitation in
 * @request on behalf of neighbour @n.
 *
 * @request:  the solicitation; request->dev must be set
 * @n:        neighbour entry supplying the advertised link-layer address
 *            (n->ha) and the IPv6 source address (n->primary_key)
 * @isrouter: value for the advertisement's router flag
 *
 * The reply's Ethernet destination is the solicitation's source MAC unless
 * a Source Link-Layer Address option is present, in which case the option
 * value is used.
 *
 * Returns the new skb (data pointer at the IPv6 header), or NULL when
 * request->dev is unset, the request cannot be pulled, allocation fails,
 * or the solicitation carries an option with length zero.
 */
static struct sk_buff *vxlan_na_create(struct sk_buff *request,
	struct neighbour *n, bool isrouter)
{
	struct net_device *dev = request->dev;
	struct sk_buff *reply;
	struct nd_msg *ns, *na;
	struct ipv6hdr *pip6;
	u8 *daddr;
	int na_olen = 8; /* opt hdr + ETH_ALEN for target */
	int ns_olen;
	int i, len;

	/* The options are walked through plain pointers below, so the whole
	 * request is asked to be available via pskb_may_pull() first.
	 */
	if (dev == NULL || !pskb_may_pull(request, request->len))
		return NULL;

	len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
		sizeof(*na) + na_olen + dev->needed_tailroom;
	reply = alloc_skb(len, GFP_ATOMIC);
	if (reply == NULL)
		return NULL;

	reply->protocol = htons(ETH_P_IPV6);
	reply->dev = dev;
	skb_reserve(reply, LL_RESERVED_SPACE(request->dev));
	skb_push(reply, sizeof(struct ethhdr));
	skb_reset_mac_header(reply);

	ns = (struct nd_msg *)(ipv6_hdr(request) + 1);

	daddr = eth_hdr(request)->h_source;
	ns_olen = request->len - skb_network_offset(request) -
		sizeof(struct ipv6hdr) - sizeof(*ns);
	for (i = 0; i < ns_olen-1; i += (ns->opt[i+1]<<3)) {
		/* The step is the option length in units of 8 bytes; a zero
		 * length would never advance 'i' and loop forever, so such
		 * a packet is rejected.
		 */
		if (!ns->opt[i + 1]) {
			kfree_skb(reply);
			return NULL;
		}
		if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
			daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
			break;
		}
	}

	/* Ethernet header */
	ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
	ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
	eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);

	skb_pull(reply, sizeof(struct ethhdr));
	skb_reset_network_header(reply);
	skb_put(reply, sizeof(struct ipv6hdr));

	/* IPv6 header */

	pip6 = ipv6_hdr(reply);
	memset(pip6, 0, sizeof(struct ipv6hdr));
	pip6->version = 6;
	pip6->priority = ipv6_hdr(request)->priority;
	pip6->nexthdr = IPPROTO_ICMPV6;
	pip6->hop_limit = 255;
	pip6->daddr = ipv6_hdr(request)->saddr;
	pip6->saddr = *(struct in6_addr *)n->primary_key;

	skb_pull(reply, sizeof(struct ipv6hdr));
	skb_reset_transport_header(reply);

	/* Neighbor Advertisement */
	na = skb_put_zero(reply, sizeof(*na) + na_olen);
	na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
	na->icmph.icmp6_router = isrouter;
	na->icmph.icmp6_override = 1;
	na->icmph.icmp6_solicited = 1;
	na->target = ns->target;
	ether_addr_copy(&na->opt[2], n->ha);
	na->opt[0] = ND_OPT_TARGET_LL_ADDR;
	na->opt[1] = na_olen >> 3;

	na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
		&pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6,
		csum_partial(na, sizeof(*na)+na_olen, 0));

	pip6->payload_len = htons(sizeof(*na)+na_olen);

	/* Expose the IPv6 header again before handing the skb back */
	skb_push(reply, sizeof(struct ipv6hdr));

	reply->ip_summed = CHECKSUM_UNNECESSARY;

	return reply;
}

2132
/* Handle an IPv6 Neighbour Solicitation locally instead of sending it over
 * the tunnel.
 *
 * When the solicited target has a connected entry in the ND table, a
 * Neighbour Advertisement is built and injected into the receive path.
 * When no entry exists and VXLAN_F_L3MISS is set, an L3 miss notification
 * is raised.  The request skb is always consumed and NETDEV_TX_OK is
 * always returned.
 */
static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct ipv6hdr *ip6h;
	struct neighbour *neigh;
	struct sk_buff *reply;
	struct vxlan_fdb *fdb;
	struct nd_msg *ns;

	if (!__in6_dev_get(dev))
		goto out;

	ip6h = ipv6_hdr(skb);
	ns = (struct nd_msg *)(ip6h + 1);

	if (ipv6_addr_loopback(&ip6h->daddr) ||
	    ipv6_addr_is_multicast(&ns->target))
		goto out;

	neigh = neigh_lookup(ipv6_stub->nd_tbl, &ns->target, dev);
	if (!neigh) {
		if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
			union vxlan_addr ipa = {
				.sin6.sin6_addr = ns->target,
				.sin6.sin6_family = AF_INET6,
			};

			vxlan_ip_miss(dev, &ipa);
		}
		goto out;
	}

	if (!(neigh->nud_state & NUD_CONNECTED)) {
		neigh_release(neigh);
		goto out;
	}

	fdb = vxlan_find_mac(vxlan, neigh->ha, vni);
	if (fdb && vxlan_addr_any(&(first_remote_rcu(fdb)->remote_ip))) {
		/* bridge-local neighbor */
		neigh_release(neigh);
		goto out;
	}

	/* Router flag is taken from the FDB entry when there is one */
	reply = vxlan_na_create(skb, neigh,
				fdb && (fdb->flags & NTF_ROUTER));
	neigh_release(neigh);
	if (!reply)
		goto out;

	if (netif_rx_ni(reply) == NET_RX_DROP)
		dev->stats.rx_dropped++;

out:
	consume_skb(skb);
	return NETDEV_TX_OK;
}
#endif

D
David Stevens 已提交
2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207
static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct neighbour *n;

	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
		return false;

	n = NULL;
	switch (ntohs(eth_hdr(skb)->h_proto)) {
	case ETH_P_IP:
2208 2209 2210
	{
		struct iphdr *pip;

D
David Stevens 已提交
2211 2212 2213 2214
		if (!pskb_may_pull(skb, sizeof(struct iphdr)))
			return false;
		pip = ip_hdr(skb);
		n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
2215
		if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
C
Cong Wang 已提交
2216 2217
			union vxlan_addr ipa = {
				.sin.sin_addr.s_addr = pip->daddr,
2218
				.sin.sin_family = AF_INET,
C
Cong Wang 已提交
2219 2220 2221 2222 2223 2224
			};

			vxlan_ip_miss(dev, &ipa);
			return false;
		}

D
David Stevens 已提交
2225
		break;
2226 2227 2228 2229 2230 2231 2232 2233 2234 2235
	}
#if IS_ENABLED(CONFIG_IPV6)
	case ETH_P_IPV6:
	{
		struct ipv6hdr *pip6;

		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
			return false;
		pip6 = ipv6_hdr(skb);
		n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev);
2236
		if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
2237 2238
			union vxlan_addr ipa = {
				.sin6.sin6_addr = pip6->daddr,
2239
				.sin6.sin6_family = AF_INET6,
2240 2241 2242 2243 2244 2245 2246 2247 2248
			};

			vxlan_ip_miss(dev, &ipa);
			return false;
		}

		break;
	}
#endif
D
David Stevens 已提交
2249 2250 2251 2252 2253 2254 2255
	default:
		return false;
	}

	if (n) {
		bool diff;

2256
		diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha);
D
David Stevens 已提交
2257 2258 2259 2260 2261 2262 2263
		if (diff) {
			memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
				dev->addr_len);
			memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
		}
		neigh_release(n);
		return diff;
C
Cong Wang 已提交
2264 2265
	}

D
David Stevens 已提交
2266 2267 2268
	return false;
}

2269
/* Encode the group policy value md->gbp into the VXLAN header @vxh.
 * Nothing is written when md->gbp is zero.  @vxflags is not used here.
 */
static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)vxh;
	u32 policy = md->gbp;

	if (policy == 0)
		return;

	vxh->vx_flags |= VXLAN_HF_GBP;

	if (policy & VXLAN_GBP_DONT_LEARN)
		gbp->dont_learn = 1;

	if (policy & VXLAN_GBP_POLICY_APPLIED)
		gbp->policy_applied = 1;

	gbp->policy_id = htons(policy & VXLAN_GBP_ID_MASK);
}

J
Jiri Benc 已提交
2289 2290 2291 2292 2293 2294
/* Fill in the GPE next-protocol fields of @vxh for payload type @protocol.
 * Returns 0 on success, or -EPFNOSUPPORT when tun_p_from_eth_p() yields no
 * next-protocol value for @protocol.  @vxflags is not used here.
 */
static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags,
			       __be16 protocol)
{
	struct vxlanhdr_gpe *hdr = (struct vxlanhdr_gpe *)vxh;

	hdr->np_applied = 1;
	hdr->next_protocol = tun_p_from_eth_p(protocol);

	return hdr->next_protocol ? 0 : -EPFNOSUPPORT;
}

2301 2302 2303
static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
			   int iphdr_len, __be32 vni,
			   struct vxlan_metadata *md, u32 vxflags,
2304
			   bool udp_sum)
C
Cong Wang 已提交
2305 2306 2307 2308
{
	struct vxlanhdr *vxh;
	int min_headroom;
	int err;
T
Tom Herbert 已提交
2309
	int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
J
Jiri Benc 已提交
2310
	__be16 inner_protocol = htons(ETH_P_TEB);
T
Tom Herbert 已提交
2311

2312
	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
T
Tom Herbert 已提交
2313 2314 2315 2316 2317 2318
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		int csum_start = skb_checksum_start_offset(skb);

		if (csum_start <= VXLAN_MAX_REMCSUM_START &&
		    !(csum_start & VXLAN_RCO_SHIFT_MASK) &&
		    (skb->csum_offset == offsetof(struct udphdr, check) ||
2319
		     skb->csum_offset == offsetof(struct tcphdr, check)))
T
Tom Herbert 已提交
2320 2321
			type |= SKB_GSO_TUNNEL_REMCSUM;
	}
C
Cong Wang 已提交
2322 2323

	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
2324
			+ VXLAN_HLEN + iphdr_len;
2325 2326 2327

	/* Need space for new headers (invalidates iph ptr) */
	err = skb_cow_head(skb, min_headroom);
J
Jiri Benc 已提交
2328
	if (unlikely(err))
P
pravin shelar 已提交
2329
		return err;
2330

2331 2332
	err = iptunnel_handle_offloads(skb, type);
	if (err)
P
pravin shelar 已提交
2333
		return err;
2334

2335
	vxh = __skb_push(skb, sizeof(*vxh));
2336 2337
	vxh->vx_flags = VXLAN_HF_VNI;
	vxh->vx_vni = vxlan_vni_field(vni);
2338

T
Tom Herbert 已提交
2339
	if (type & SKB_GSO_TUNNEL_REMCSUM) {
2340
		unsigned int start;
T
Tom Herbert 已提交
2341

2342 2343 2344
		start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr);
		vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset);
		vxh->vx_flags |= VXLAN_HF_RCO;
T
Tom Herbert 已提交
2345 2346 2347 2348 2349 2350 2351

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}
	}

2352 2353
	if (vxflags & VXLAN_F_GBP)
		vxlan_build_gbp_hdr(vxh, vxflags, md);
J
Jiri Benc 已提交
2354 2355 2356
	if (vxflags & VXLAN_F_GPE) {
		err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
		if (err < 0)
P
pravin shelar 已提交
2357
			return err;
J
Jiri Benc 已提交
2358 2359
		inner_protocol = skb->protocol;
	}
T
Thomas Graf 已提交
2360

J
Jiri Benc 已提交
2361
	skb_set_inner_protocol(skb, inner_protocol);
2362
	return 0;
2363 2364
}

2365 2366
/* Look up the IPv4 route for an encapsulated packet.
 *
 * @saddr is in/out: it seeds the lookup and is overwritten with the source
 * address chosen by routing.  The dst cache is consulted and refreshed
 * unless caching is unusable for this skb or a tos is given without tunnel
 * info.
 *
 * Returns the route, or ERR_PTR(-EIO) when @sock4 is NULL,
 * ERR_PTR(-ENETUNREACH) when the lookup fails, and ERR_PTR(-ELOOP) when
 * the route points back at @dev.
 */
static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct net_device *dev,
				      struct vxlan_sock *sock4,
				      struct sk_buff *skb, int oif, u8 tos,
				      __be32 daddr, __be32 *saddr, __be16 dport, __be16 sport,
				      struct dst_cache *dst_cache,
				      const struct ip_tunnel_info *info)
{
	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
	struct flowi4 key;
	struct rtable *rt;

	if (!sock4)
		return ERR_PTR(-EIO);

	if (!info && tos)
		use_cache = false;

	if (use_cache) {
		rt = dst_cache_get_ip4(dst_cache, saddr);
		if (rt)
			return rt;
	}

	memset(&key, 0, sizeof(key));
	key.flowi4_proto = IPPROTO_UDP;
	key.flowi4_oif = oif;
	key.flowi4_mark = skb->mark;
	key.flowi4_tos = RT_TOS(tos);
	key.saddr = *saddr;
	key.daddr = daddr;
	key.fl4_sport = sport;
	key.fl4_dport = dport;

	rt = ip_route_output_key(vxlan->net, &key);
	if (IS_ERR(rt)) {
		netdev_dbg(dev, "no route to %pI4\n", &daddr);
		return ERR_PTR(-ENETUNREACH);
	}

	if (rt->dst.dev == dev) {
		netdev_dbg(dev, "circular route to %pI4\n", &daddr);
		ip_rt_put(rt);
		return ERR_PTR(-ELOOP);
	}

	*saddr = key.saddr;
	if (use_cache)
		dst_cache_set_ip4(dst_cache, &rt->dst, key.saddr);

	return rt;
}

2415 2416
#if IS_ENABLED(CONFIG_IPV6)
static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
2417
					  struct net_device *dev,
2418
					  struct vxlan_sock *sock6,
2419
					  struct sk_buff *skb, int oif, u8 tos,
2420
					  __be32 label,
2421
					  const struct in6_addr *daddr,
2422
					  struct in6_addr *saddr,
2423
					  __be16 dport, __be16 sport,
2424 2425
					  struct dst_cache *dst_cache,
					  const struct ip_tunnel_info *info)
2426
{
2427
	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
2428 2429 2430
	struct dst_entry *ndst;
	struct flowi6 fl6;

2431 2432 2433
	if (!sock6)
		return ERR_PTR(-EIO);

2434 2435
	if (tos && !info)
		use_cache = false;
2436
	if (use_cache) {
2437 2438 2439 2440 2441
		ndst = dst_cache_get_ip6(dst_cache, saddr);
		if (ndst)
			return ndst;
	}

2442 2443 2444
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.daddr = *daddr;
2445
	fl6.saddr = *saddr;
2446
	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tos), label);
2447 2448
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = IPPROTO_UDP;
2449 2450
	fl6.fl6_dport = dport;
	fl6.fl6_sport = sport;
2451

2452 2453 2454
	ndst = ipv6_stub->ipv6_dst_lookup_flow(vxlan->net, sock6->sock->sk,
					       &fl6, NULL);
	if (unlikely(IS_ERR(ndst))) {
2455 2456 2457 2458 2459 2460 2461 2462 2463
		netdev_dbg(dev, "no route to %pI6\n", daddr);
		return ERR_PTR(-ENETUNREACH);
	}

	if (unlikely(ndst->dev == dev)) {
		netdev_dbg(dev, "circular route to %pI6\n", daddr);
		dst_release(ndst);
		return ERR_PTR(-ELOOP);
	}
2464 2465

	*saddr = fl6.saddr;
2466
	if (use_cache)
2467
		dst_cache_set_ip6(dst_cache, ndst, saddr);
2468 2469 2470 2471
	return ndst;
}
#endif

2472 2473
/* Bypass encapsulation if the destination is local: hand @skb directly to
 * @dst_vxlan's receive path.
 *
 * TX statistics are charged to @src_vxlan and RX statistics to @dst_vxlan,
 * both using the packet length sampled on entry.  When learning is enabled
 * on @dst_vxlan the source MAC is snooped against a loopback address.  The
 * skb is freed when the destination device is down, and rx_dropped is
 * bumped whenever delivery does not succeed.
 */
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
			       struct vxlan_dev *dst_vxlan, __be32 vni)
{
	union vxlan_addr *remote = &dst_vxlan->default_dst.remote_ip;
	struct pcpu_sw_netstats *tx, *rx;
	struct net_device *rx_dev;
	union vxlan_addr lo_addr;
	int pkt_len = skb->len;

	tx = this_cpu_ptr(src_vxlan->dev->tstats);
	rx = this_cpu_ptr(dst_vxlan->dev->tstats);

	skb->pkt_type = PACKET_HOST;
	skb->encapsulation = 0;
	skb->dev = dst_vxlan->dev;
	__skb_pull(skb, skb_network_offset(skb));

	/* Loopback address in the family of the destination's default remote */
	if (remote->sa.sa_family == AF_INET) {
		lo_addr.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		lo_addr.sa.sa_family =  AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		lo_addr.sin6.sin6_addr = in6addr_loopback;
		lo_addr.sa.sa_family =  AF_INET6;
#endif
	}

	rcu_read_lock();
	rx_dev = skb->dev;
	if (unlikely(!(rx_dev->flags & IFF_UP))) {
		kfree_skb(skb);
		goto drop;
	}

	if (dst_vxlan->cfg.flags & VXLAN_F_LEARN)
		vxlan_snoop(rx_dev, &lo_addr, eth_hdr(skb)->h_source, 0, vni);

	u64_stats_update_begin(&tx->syncp);
	tx->tx_packets++;
	tx->tx_bytes += pkt_len;
	u64_stats_update_end(&tx->syncp);

	if (netif_rx(skb) == NET_RX_SUCCESS) {
		u64_stats_update_begin(&rx->syncp);
		rx->rx_packets++;
		rx->rx_bytes += pkt_len;
		u64_stats_update_end(&rx->syncp);

		rcu_read_unlock();
		return;
	}

drop:
	rx_dev->stats.rx_dropped++;
	rcu_read_unlock();
}

2526
static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
2527 2528 2529 2530
				 struct vxlan_dev *vxlan,
				 union vxlan_addr *daddr,
				 __be16 dst_port, int dst_ifindex, __be32 vni,
				 struct dst_entry *dst,
2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545
				 u32 rt_flags)
{
#if IS_ENABLED(CONFIG_IPV6)
	/* IPv6 rt-flags are checked against RTF_LOCAL, but the value of
	 * RTF_LOCAL is equal to RTCF_LOCAL. So to keep code simple
	 * we can use RTCF_LOCAL which works for ipv4 and ipv6 route entry.
	 */
	BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL);
#endif
	/* Bypass encapsulation if the destination is local */
	if (rt_flags & RTCF_LOCAL &&
	    !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
		struct vxlan_dev *dst_vxlan;

		dst_release(dst);
2546
		dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni,
2547
					   daddr->sa.sa_family, dst_port,
2548
					   vxlan->cfg.flags);
2549 2550 2551 2552 2553 2554
		if (!dst_vxlan) {
			dev->stats.tx_errors++;
			kfree_skb(skb);

			return -ENOENT;
		}
2555
		vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni);
2556 2557 2558 2559 2560 2561
		return 1;
	}

	return 0;
}

2562
static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
2563 2564
			   __be32 default_vni, struct vxlan_rdst *rdst,
			   bool did_rsc)
S
stephen hemminger 已提交
2565
{
2566
	struct dst_cache *dst_cache;
2567
	struct ip_tunnel_info *info;
S
stephen hemminger 已提交
2568
	struct vxlan_dev *vxlan = netdev_priv(dev);
P
pravin shelar 已提交
2569
	const struct iphdr *old_iph = ip_hdr(skb);
C
Cong Wang 已提交
2570
	union vxlan_addr *dst;
2571
	union vxlan_addr remote_ip, local_ip;
T
Thomas Graf 已提交
2572 2573
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
C
Cong Wang 已提交
2574
	__be16 src_port = 0, dst_port;
2575
	struct dst_entry *ndst = NULL;
2576
	__be32 vni, label;
S
stephen hemminger 已提交
2577
	__u8 tos, ttl;
2578
	int ifindex;
2579
	int err;
2580
	u32 flags = vxlan->cfg.flags;
2581
	bool udp_sum = false;
2582
	bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));
S
stephen hemminger 已提交
2583

2584
	info = skb_tunnel_info(skb);
2585

T
Thomas Graf 已提交
2586
	if (rdst) {
P
pravin shelar 已提交
2587 2588 2589 2590
		dst = &rdst->remote_ip;
		if (vxlan_addr_any(dst)) {
			if (did_rsc) {
				/* short-circuited back to local bridge */
2591
				vxlan_encap_bypass(skb, vxlan, vxlan, default_vni);
P
pravin shelar 已提交
2592 2593 2594 2595 2596
				return;
			}
			goto drop;
		}

2597
		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
2598
		vni = (rdst->remote_vni) ? : default_vni;
2599
		ifindex = rdst->remote_ifindex;
2600
		local_ip = vxlan->cfg.saddr;
2601
		dst_cache = &rdst->dst_cache;
P
pravin shelar 已提交
2602
		md->gbp = skb->mark;
H
Hangbin Liu 已提交
2603 2604 2605 2606 2607 2608 2609
		if (flags & VXLAN_F_TTL_INHERIT) {
			ttl = ip_tunnel_get_ttl(old_iph, skb);
		} else {
			ttl = vxlan->cfg.ttl;
			if (!ttl && vxlan_addr_multicast(dst))
				ttl = 1;
		}
P
pravin shelar 已提交
2610 2611 2612 2613 2614 2615 2616 2617 2618 2619

		tos = vxlan->cfg.tos;
		if (tos == 1)
			tos = ip_tunnel_get_dsfield(old_iph, skb);

		if (dst->sa.sa_family == AF_INET)
			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
		else
			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
		label = vxlan->cfg.label;
T
Thomas Graf 已提交
2620 2621 2622 2623 2624 2625
	} else {
		if (!info) {
			WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
				  dev->name);
			goto drop;
		}
2626
		remote_ip.sa.sa_family = ip_tunnel_info_af(info);
2627
		if (remote_ip.sa.sa_family == AF_INET) {
2628
			remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
2629 2630
			local_ip.sin.sin_addr.s_addr = info->key.u.ipv4.src;
		} else {
2631
			remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;
2632 2633
			local_ip.sin6.sin6_addr = info->key.u.ipv6.src;
		}
T
Thomas Graf 已提交
2634
		dst = &remote_ip;
P
pravin shelar 已提交
2635 2636
		dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
		vni = tunnel_id_to_key32(info->key.tun_id);
2637
		ifindex = 0;
2638
		dst_cache = &info->dst_cache;
2639 2640 2641
		if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
			if (info->options_len < sizeof(*md))
				goto drop;
P
pravin shelar 已提交
2642
			md = ip_tunnel_info_opts(info);
2643
		}
2644 2645
		ttl = info->key.ttl;
		tos = info->key.tos;
2646
		label = info->key.label;
2647
		udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
2648
	}
P
pravin shelar 已提交
2649 2650
	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
				     vxlan->cfg.port_max, true);
2651

J
Jakub Kicinski 已提交
2652
	rcu_read_lock();
C
Cong Wang 已提交
2653
	if (dst->sa.sa_family == AF_INET) {
2654
		struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
P
pravin shelar 已提交
2655
		struct rtable *rt;
P
pravin shelar 已提交
2656
		__be16 df = 0;
2657

2658 2659 2660
		if (!ifindex)
			ifindex = sock4->sock->sk->sk_bound_dev_if;

2661
		rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos,
2662
				     dst->sin.sin_addr.s_addr,
2663
				     &local_ip.sin.sin_addr.s_addr,
2664
				     dst_port, src_port,
2665
				     dst_cache, info);
2666 2667
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
P
pravin shelar 已提交
2668
			goto tx_error;
2669
		}
C
Cong Wang 已提交
2670

2671
		if (!info) {
2672
			/* Bypass encapsulation if the destination is local */
2673
			err = encap_bypass_if_local(skb, dev, vxlan, dst,
2674 2675
						    dst_port, ifindex, vni,
						    &rt->dst, rt->rt_flags);
2676
			if (err)
J
Jakub Kicinski 已提交
2677
				goto out_unlock;
2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688

			if (vxlan->cfg.df == VXLAN_DF_SET) {
				df = htons(IP_DF);
			} else if (vxlan->cfg.df == VXLAN_DF_INHERIT) {
				struct ethhdr *eth = eth_hdr(skb);

				if (ntohs(eth->h_proto) == ETH_P_IPV6 ||
				    (ntohs(eth->h_proto) == ETH_P_IP &&
				     old_iph->frag_off & htons(IP_DF)))
					df = htons(IP_DF);
			}
2689
		} else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) {
2690
			df = htons(IP_DF);
2691
		}
2692

P
pravin shelar 已提交
2693
		ndst = &rt->dst;
2694
		skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM);
X
Xin Long 已提交
2695

H
Hangbin Liu 已提交
2696
		tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb);
C
Cong Wang 已提交
2697
		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
P
pravin shelar 已提交
2698
		err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
2699
				      vni, md, flags, udp_sum);
2700
		if (err < 0)
P
pravin shelar 已提交
2701
			goto tx_error;
2702

2703
		udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, local_ip.sin.sin_addr.s_addr,
2704 2705
				    dst->sin.sin_addr.s_addr, tos, ttl, df,
				    src_port, dst_port, xnet, !udp_sum);
C
Cong Wang 已提交
2706 2707
#if IS_ENABLED(CONFIG_IPV6)
	} else {
2708
		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
C
Cong Wang 已提交
2709

2710 2711 2712
		if (!ifindex)
			ifindex = sock6->sock->sk->sk_bound_dev_if;

2713
		ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos,
2714
					label, &dst->sin6.sin6_addr,
2715
					&local_ip.sin6.sin6_addr,
2716
					dst_port, src_port,
2717
					dst_cache, info);
2718
		if (IS_ERR(ndst)) {
2719
			err = PTR_ERR(ndst);
P
pravin shelar 已提交
2720
			ndst = NULL;
2721
			goto tx_error;
C
Cong Wang 已提交
2722
		}
2723

2724 2725
		if (!info) {
			u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
2726

2727
			err = encap_bypass_if_local(skb, dev, vxlan, dst,
2728 2729
						    dst_port, ifindex, vni,
						    ndst, rt6i_flags);
2730
			if (err)
J
Jakub Kicinski 已提交
2731
				goto out_unlock;
2732
		}
2733

2734
		skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM);
X
Xin Long 已提交
2735

H
Hangbin Liu 已提交
2736
		tos = ip_tunnel_ecn_encap(RT_TOS(tos), old_iph, skb);
C
Cong Wang 已提交
2737
		ttl = ttl ? : ip6_dst_hoplimit(ndst);
2738 2739
		skb_scrub_packet(skb, xnet);
		err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr),
2740
				      vni, md, flags, udp_sum);
P
pravin shelar 已提交
2741 2742 2743
		if (err < 0)
			goto tx_error;

P
pravin shelar 已提交
2744
		udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
2745
				     &local_ip.sin6.sin6_addr,
2746
				     &dst->sin6.sin6_addr, tos, ttl,
2747
				     label, src_port, dst_port, !udp_sum);
C
Cong Wang 已提交
2748 2749
#endif
	}
J
Jakub Kicinski 已提交
2750 2751
out_unlock:
	rcu_read_unlock();
2752
	return;
S
stephen hemminger 已提交
2753 2754 2755

drop:
	dev->stats.tx_dropped++;
P
pravin shelar 已提交
2756 2757
	dev_kfree_skb(skb);
	return;
S
stephen hemminger 已提交
2758 2759

tx_error:
J
Jakub Kicinski 已提交
2760
	rcu_read_unlock();
2761 2762 2763 2764
	if (err == -ELOOP)
		dev->stats.collisions++;
	else if (err == -ENETUNREACH)
		dev->stats.tx_carrier_errors++;
P
pravin shelar 已提交
2765
	dst_release(ndst);
S
stephen hemminger 已提交
2766
	dev->stats.tx_errors++;
P
pravin shelar 已提交
2767
	kfree_skb(skb);
S
stephen hemminger 已提交
2768 2769
}

2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801
/* Transmit @skb through the nexthop object attached to FDB entry @f.
 *
 * One path is selected from the nexthop using the skb's flow hash and
 * turned into a temporary remote for vxlan_xmit_one().  When the entry has
 * no nexthop or no path can be selected, the frame is dropped and counted
 * in tx_dropped.
 */
static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev,
			  struct vxlan_fdb *f, __be32 vni, bool did_rsc)
{
	struct vxlan_rdst path;
	struct nexthop *nh;
	bool selected = false;
	u32 flow_hash;

	memset(&path, 0, sizeof(struct vxlan_rdst));
	flow_hash = skb_get_hash(skb);

	rcu_read_lock();
	nh = rcu_dereference(f->nh);
	if (nh)
		selected = vxlan_fdb_nh_path_select(nh, flow_hash, &path);
	rcu_read_unlock();

	if (unlikely(!selected)) {
		dev->stats.tx_dropped++;
		dev_kfree_skb(skb);
		return;
	}

	vxlan_xmit_one(skb, dev, vni, &path, did_rsc);
}

2802 2803 2804 2805 2806 2807 2808 2809 2810
/* Transmit local packets over Vxlan
 *
 * Outer IP header inherits ECN and DF from inner header.
 * Outer UDP destination is the VXLAN assigned port.
 *           source port is based on hash of flow
 */
static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
2811
	struct vxlan_rdst *rdst, *fdst = NULL;
2812
	const struct ip_tunnel_info *info;
2813 2814
	bool did_rsc = false;
	struct vxlan_fdb *f;
2815
	struct ethhdr *eth;
2816
	__be32 vni = 0;
2817

2818
	info = skb_tunnel_info(skb);
2819

2820 2821
	skb_reset_mac_header(skb);

2822
	if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
2823 2824 2825 2826 2827 2828 2829 2830 2831 2832
		if (info && info->mode & IP_TUNNEL_INFO_BRIDGE &&
		    info->mode & IP_TUNNEL_INFO_TX) {
			vni = tunnel_id_to_key32(info->key.tun_id);
		} else {
			if (info && info->mode & IP_TUNNEL_INFO_TX)
				vxlan_xmit_one(skb, dev, vni, NULL, false);
			else
				kfree_skb(skb);
			return NETDEV_TX_OK;
		}
2833 2834
	}

2835
	if (vxlan->cfg.flags & VXLAN_F_PROXY) {
2836
		eth = eth_hdr(skb);
C
Cong Wang 已提交
2837
		if (ntohs(eth->h_proto) == ETH_P_ARP)
2838
			return arp_reduce(dev, skb, vni);
C
Cong Wang 已提交
2839
#if IS_ENABLED(CONFIG_IPV6)
2840 2841 2842 2843 2844 2845 2846 2847
		else if (ntohs(eth->h_proto) == ETH_P_IPV6 &&
			 pskb_may_pull(skb, sizeof(struct ipv6hdr) +
					    sizeof(struct nd_msg)) &&
			 ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
			struct nd_msg *m = (struct nd_msg *)(ipv6_hdr(skb) + 1);

			if (m->icmph.icmp6_code == 0 &&
			    m->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
2848
				return neigh_reduce(dev, skb, vni);
C
Cong Wang 已提交
2849 2850 2851
		}
#endif
	}
2852

2853
	eth = eth_hdr(skb);
2854
	f = vxlan_find_mac(vxlan, eth->h_dest, vni);
2855 2856
	did_rsc = false;

2857
	if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) &&
2858 2859
	    (ntohs(eth->h_proto) == ETH_P_IP ||
	     ntohs(eth->h_proto) == ETH_P_IPV6)) {
2860 2861
		did_rsc = route_shortcircuit(dev, skb);
		if (did_rsc)
2862
			f = vxlan_find_mac(vxlan, eth->h_dest, vni);
2863 2864
	}

2865
	if (f == NULL) {
2866
		f = vxlan_find_mac(vxlan, all_zeros_mac, vni);
2867
		if (f == NULL) {
2868
			if ((vxlan->cfg.flags & VXLAN_F_L2MISS) &&
2869 2870 2871 2872
			    !is_multicast_ether_addr(eth->h_dest))
				vxlan_fdb_miss(vxlan, eth->h_dest);

			dev->stats.tx_dropped++;
2873
			kfree_skb(skb);
2874 2875 2876
			return NETDEV_TX_OK;
		}
	}
2877

2878 2879 2880 2881 2882 2883
	if (rcu_access_pointer(f->nh)) {
		vxlan_xmit_nh(skb, dev, f,
			      (vni ? : vxlan->default_dst.remote_vni), did_rsc);
	} else {
		list_for_each_entry_rcu(rdst, &f->remotes, list) {
			struct sk_buff *skb1;
2884

2885 2886 2887 2888 2889 2890 2891
			if (!fdst) {
				fdst = rdst;
				continue;
			}
			skb1 = skb_clone(skb, GFP_ATOMIC);
			if (skb1)
				vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc);
2892
		}
2893 2894 2895 2896
		if (fdst)
			vxlan_xmit_one(skb, dev, vni, fdst, did_rsc);
		else
			kfree_skb(skb);
2897 2898
	}

2899
	return NETDEV_TX_OK;
2900 2901
}

S
stephen hemminger 已提交
2902
/* Walk the forwarding table and purge stale entries */
2903
static void vxlan_cleanup(struct timer_list *t)
S
stephen hemminger 已提交
2904
{
2905
	struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer);
S
stephen hemminger 已提交
2906 2907 2908 2909 2910 2911 2912 2913
	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
	unsigned int h;

	if (!netif_running(vxlan->dev))
		return;

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;
2914

2915
		spin_lock(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
2916 2917 2918 2919 2920
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
			unsigned long timeout;

2921
			if (f->state & (NUD_PERMANENT | NUD_NOARP))
S
stephen hemminger 已提交
2922 2923
				continue;

2924 2925 2926
			if (f->flags & NTF_EXT_LEARNED)
				continue;

2927
			timeout = f->used + vxlan->cfg.age_interval * HZ;
S
stephen hemminger 已提交
2928 2929 2930 2931 2932
			if (time_before_eq(timeout, jiffies)) {
				netdev_dbg(vxlan->dev,
					   "garbage collect %pM\n",
					   f->eth_addr);
				f->state = NUD_STALE;
2933
				vxlan_fdb_destroy(vxlan, f, true, true);
S
stephen hemminger 已提交
2934 2935 2936
			} else if (time_before(timeout, next_timer))
				next_timer = timeout;
		}
2937
		spin_unlock(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
2938 2939 2940 2941 2942
	}

	mod_timer(&vxlan->age_timer, next_timer);
}

2943 2944 2945 2946 2947
/* Unlink @vxlan from the per-socket VNI lists (IPv4 and, when built in,
 * IPv6) under the per-netns socket lock.
 */
static void vxlan_vs_del_dev(struct vxlan_dev *vxlan)
{
	struct vxlan_net *netns_state;

	netns_state = net_generic(vxlan->net, vxlan_net_id);

	spin_lock(&netns_state->sock_lock);
	hlist_del_init_rcu(&vxlan->hlist4.hlist);
#if IS_ENABLED(CONFIG_IPV6)
	hlist_del_init_rcu(&vxlan->hlist6.hlist);
#endif
	spin_unlock(&netns_state->sock_lock);
}

J
Jiri Benc 已提交
2955 2956
static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan,
			     struct vxlan_dev_node *node)
2957
{
2958
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
2959
	__be32 vni = vxlan->default_dst.remote_vni;
2960

J
Jiri Benc 已提交
2961
	node->vxlan = vxlan;
2962
	spin_lock(&vn->sock_lock);
J
Jiri Benc 已提交
2963
	hlist_add_head_rcu(&node->hlist, vni_head(vs, vni));
2964
	spin_unlock(&vn->sock_lock);
2965 2966
}

S
stephen hemminger 已提交
2967 2968 2969
/* Setup stats when device is created */
static int vxlan_init(struct net_device *dev)
{
2970 2971 2972
	struct vxlan_dev *vxlan = netdev_priv(dev);
	int err;

2973
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
2974
	if (!dev->tstats)
S
stephen hemminger 已提交
2975 2976
		return -ENOMEM;

2977 2978 2979 2980 2981 2982
	err = gro_cells_init(&vxlan->gro_cells, dev);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

S
stephen hemminger 已提交
2983 2984 2985
	return 0;
}

2986
/* Look up the all-zeros-MAC FDB entry for @vni and destroy it if present,
 * holding the lock of the hash bucket that entry lives in.
 */
static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni)
{
	u32 bucket = fdb_head_index(vxlan, all_zeros_mac, vni);
	struct vxlan_fdb *def;

	spin_lock_bh(&vxlan->hash_lock[bucket]);

	def = __vxlan_find_mac(vxlan, all_zeros_mac, vni);
	if (def)
		vxlan_fdb_destroy(vxlan, def, true, true);

	spin_unlock_bh(&vxlan->hash_lock[bucket]);
}

2998 2999 3000 3001
static void vxlan_uninit(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

3002 3003
	gro_cells_destroy(&vxlan->gro_cells);

3004
	vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni);
3005

3006 3007 3008
	free_percpu(dev->tstats);
}

S
stephen hemminger 已提交
3009 3010 3011 3012
/* Start ageing timer and join group when device is brought up */
static int vxlan_open(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
3013
	int ret;
3014

3015 3016 3017
	ret = vxlan_sock_add(vxlan);
	if (ret < 0)
		return ret;
S
stephen hemminger 已提交
3018

3019
	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
3020
		ret = vxlan_igmp_join(vxlan);
3021 3022
		if (ret == -EADDRINUSE)
			ret = 0;
3023
		if (ret) {
3024
			vxlan_sock_release(vxlan);
3025 3026
			return ret;
		}
S
stephen hemminger 已提交
3027 3028
	}

3029
	if (vxlan->cfg.age_interval)
S
stephen hemminger 已提交
3030 3031
		mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);

3032
	return ret;
S
stephen hemminger 已提交
3033 3034 3035
}

/* Purge the forwarding table */
3036
static void vxlan_flush(struct vxlan_dev *vxlan, bool do_all)
S
stephen hemminger 已提交
3037
{
3038
	unsigned int h;
S
stephen hemminger 已提交
3039 3040 3041

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;
3042 3043

		spin_lock_bh(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
3044 3045 3046
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
3047 3048
			if (!do_all && (f->state & (NUD_PERMANENT | NUD_NOARP)))
				continue;
3049 3050
			/* the all_zeros_mac entry is deleted at vxlan_uninit */
			if (!is_zero_ether_addr(f->eth_addr))
3051
				vxlan_fdb_destroy(vxlan, f, true, true);
S
stephen hemminger 已提交
3052
		}
3053
		spin_unlock_bh(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
3054 3055 3056 3057 3058 3059 3060
	}
}

/* Cleanup timer and forwarding table on shutdown */
static int vxlan_stop(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
N
Nicolas Dichtel 已提交
3061
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
3062
	int ret = 0;
S
stephen hemminger 已提交
3063

3064
	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
3065
	    !vxlan_group_used(vn, vxlan))
3066
		ret = vxlan_igmp_leave(vxlan);
S
stephen hemminger 已提交
3067 3068 3069

	del_timer_sync(&vxlan->age_timer);

3070
	vxlan_flush(vxlan, false);
3071
	vxlan_sock_release(vxlan);
S
stephen hemminger 已提交
3072

3073
	return ret;
S
stephen hemminger 已提交
3074 3075 3076 3077 3078 3079 3080
}

/* ndo_set_rx_mode stub: intentionally empty. */
static void vxlan_set_multicast_list(struct net_device *dev)
{
	/* Nothing needs to be done. */
}

3081
static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
3082
{
3083 3084 3085 3086
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	struct net_device *lowerdev = __dev_get_by_index(vxlan->net,
							 dst->remote_ifindex);
3087
	bool use_ipv6 = !!(vxlan->cfg.flags & VXLAN_F_IPV6);
3088

3089 3090 3091 3092 3093 3094 3095
	/* This check is different than dev->max_mtu, because it looks at
	 * the lowerdev->mtu, rather than the static dev->max_mtu
	 */
	if (lowerdev) {
		int max_mtu = lowerdev->mtu -
			      (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
		if (new_mtu > max_mtu)
D
David Wragg 已提交
3096 3097 3098
			return -EINVAL;
	}

3099 3100 3101 3102
	dev->mtu = new_mtu;
	return 0;
}

3103 3104 3105 3106 3107 3108 3109 3110 3111 3112
static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	__be16 sport, dport;

	sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
				  vxlan->cfg.port_max, true);
	dport = info->key.tp_dst ? : vxlan->cfg.dst_port;

3113
	if (ip_tunnel_info_af(info) == AF_INET) {
3114
		struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
3115 3116
		struct rtable *rt;

3117
		rt = vxlan_get_route(vxlan, dev, sock4, skb, 0, info->key.tos,
3118
				     info->key.u.ipv4.dst,
3119 3120
				     &info->key.u.ipv4.src, dport, sport,
				     &info->dst_cache, info);
3121 3122 3123
		if (IS_ERR(rt))
			return PTR_ERR(rt);
		ip_rt_put(rt);
3124 3125
	} else {
#if IS_ENABLED(CONFIG_IPV6)
3126
		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
3127 3128
		struct dst_entry *ndst;

3129
		ndst = vxlan6_get_route(vxlan, dev, sock6, skb, 0, info->key.tos,
3130
					info->key.label, &info->key.u.ipv6.dst,
3131 3132
					&info->key.u.ipv6.src, dport, sport,
					&info->dst_cache, info);
3133 3134 3135 3136 3137 3138 3139
		if (IS_ERR(ndst))
			return PTR_ERR(ndst);
		dst_release(ndst);
#else /* !CONFIG_IPV6 */
		return -EPFNOSUPPORT;
#endif
	}
3140 3141
	info->key.tp_src = sport;
	info->key.tp_dst = dport;
3142
	return 0;
3143 3144
}

3145
static const struct net_device_ops vxlan_netdev_ether_ops = {
S
stephen hemminger 已提交
3146
	.ndo_init		= vxlan_init,
3147
	.ndo_uninit		= vxlan_uninit,
S
stephen hemminger 已提交
3148 3149 3150
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,
	.ndo_start_xmit		= vxlan_xmit,
3151
	.ndo_get_stats64	= ip_tunnel_get_stats64,
S
stephen hemminger 已提交
3152
	.ndo_set_rx_mode	= vxlan_set_multicast_list,
3153
	.ndo_change_mtu		= vxlan_change_mtu,
S
stephen hemminger 已提交
3154 3155 3156 3157 3158
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_fdb_add		= vxlan_fdb_add,
	.ndo_fdb_del		= vxlan_fdb_delete,
	.ndo_fdb_dump		= vxlan_fdb_dump,
R
Roopa Prabhu 已提交
3159
	.ndo_fdb_get		= vxlan_fdb_get,
3160
	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,
3161
	.ndo_change_proto_down  = dev_change_proto_down_generic,
S
stephen hemminger 已提交
3162 3163
};

J
Jiri Benc 已提交
3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174
/* Device operations installed by vxlan_raw_setup(). */
static const struct net_device_ops vxlan_netdev_raw_ops = {
	/* lifecycle */
	.ndo_init		= vxlan_init,
	.ndo_uninit		= vxlan_uninit,
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,

	/* datapath and statistics */
	.ndo_start_xmit		= vxlan_xmit,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,

	/* link configuration */
	.ndo_change_mtu		= vxlan_change_mtu,
};

S
stephen hemminger 已提交
3175 3176 3177 3178 3179
/* Device type reported for VXLAN netdevs; lets udev identify them as
 * virtual tunnel endpoints.
 */
static struct device_type vxlan_type = {
	.name = "vxlan",
};

3180
/* Calls the ndo_udp_tunnel_add of the caller in order to
J
Joseph Gasparakis 已提交
3181
 * supply the listening VXLAN udp ports. Callers are expected
3182
 * to implement the ndo_udp_tunnel_add.
3183
 */
3184
static void vxlan_offload_rx_ports(struct net_device *dev, bool push)
3185 3186 3187 3188
{
	struct vxlan_sock *vs;
	struct net *net = dev_net(dev);
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
J
Joseph Gasparakis 已提交
3189
	unsigned int i;
3190 3191 3192

	spin_lock(&vn->sock_lock);
	for (i = 0; i < PORT_HASH_SIZE; ++i) {
3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205
		hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) {
			unsigned short type;

			if (vs->flags & VXLAN_F_GPE)
				type = UDP_TUNNEL_TYPE_VXLAN_GPE;
			else
				type = UDP_TUNNEL_TYPE_VXLAN;

			if (push)
				udp_tunnel_push_rx_port(dev, vs->sock, type);
			else
				udp_tunnel_drop_rx_port(dev, vs->sock, type);
		}
3206 3207 3208 3209
	}
	spin_unlock(&vn->sock_lock);
}

S
stephen hemminger 已提交
3210 3211 3212 3213
/* Initialize the device structure. */
static void vxlan_setup(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
3214
	unsigned int h;
S
stephen hemminger 已提交
3215

3216 3217 3218
	eth_hw_addr_random(dev);
	ether_setup(dev);

3219
	dev->needs_free_netdev = true;
S
stephen hemminger 已提交
3220 3221 3222
	SET_NETDEV_DEVTYPE(dev, &vxlan_type);

	dev->features	|= NETIF_F_LLTX;
3223
	dev->features	|= NETIF_F_SG | NETIF_F_HW_CSUM;
3224
	dev->features   |= NETIF_F_RXCSUM;
3225
	dev->features   |= NETIF_F_GSO_SOFTWARE;
3226

3227
	dev->vlan_features = dev->features;
3228
	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
3229
	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
3230
	netif_keep_dst(dev);
3231
	dev->priv_flags |= IFF_NO_QUEUE;
S
stephen hemminger 已提交
3232

3233 3234 3235 3236
	/* MTU range: 68 - 65535 */
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = ETH_MAX_MTU;

3237
	INIT_LIST_HEAD(&vxlan->next);
S
stephen hemminger 已提交
3238

3239
	timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE);
S
stephen hemminger 已提交
3240 3241 3242

	vxlan->dev = dev;

3243 3244
	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		spin_lock_init(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
3245
		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
3246
	}
S
stephen hemminger 已提交
3247 3248
}

3249 3250 3251 3252 3253 3254 3255
/* Finish setup of an Ethernet-mode device: clear IFF_TX_SKB_SHARING, set
 * IFF_LIVE_ADDR_CHANGE and install the Ethernet netdev ops.
 */
static void vxlan_ether_setup(struct net_device *dev)
{
	dev->priv_flags = (dev->priv_flags & ~IFF_TX_SKB_SHARING) |
			  IFF_LIVE_ADDR_CHANGE;
	dev->netdev_ops = &vxlan_netdev_ether_ops;
}

J
Jiri Benc 已提交
3256 3257
static void vxlan_raw_setup(struct net_device *dev)
{
3258
	dev->header_ops = NULL;
J
Jiri Benc 已提交
3259 3260 3261 3262 3263 3264 3265
	dev->type = ARPHRD_NONE;
	dev->hard_header_len = 0;
	dev->addr_len = 0;
	dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
	dev->netdev_ops = &vxlan_netdev_raw_ops;
}

S
stephen hemminger 已提交
3266 3267
static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
3268
	[IFLA_VXLAN_GROUP]	= { .len = sizeof_field(struct iphdr, daddr) },
C
Cong Wang 已提交
3269
	[IFLA_VXLAN_GROUP6]	= { .len = sizeof(struct in6_addr) },
S
stephen hemminger 已提交
3270
	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
3271
	[IFLA_VXLAN_LOCAL]	= { .len = sizeof_field(struct iphdr, saddr) },
C
Cong Wang 已提交
3272
	[IFLA_VXLAN_LOCAL6]	= { .len = sizeof(struct in6_addr) },
S
stephen hemminger 已提交
3273 3274
	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
3275
	[IFLA_VXLAN_LABEL]	= { .type = NLA_U32 },
S
stephen hemminger 已提交
3276 3277 3278
	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
	[IFLA_VXLAN_AGEING]	= { .type = NLA_U32 },
	[IFLA_VXLAN_LIMIT]	= { .type = NLA_U32 },
3279
	[IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
D
David Stevens 已提交
3280 3281 3282 3283
	[IFLA_VXLAN_PROXY]	= { .type = NLA_U8 },
	[IFLA_VXLAN_RSC]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L2MISS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L3MISS]	= { .type = NLA_U8 },
3284
	[IFLA_VXLAN_COLLECT_METADATA]	= { .type = NLA_U8 },
3285
	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
3286 3287 3288
	[IFLA_VXLAN_UDP_CSUM]	= { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]	= { .type = NLA_U8 },
T
Tom Herbert 已提交
3289 3290
	[IFLA_VXLAN_REMCSUM_TX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_REMCSUM_RX]	= { .type = NLA_U8 },
T
Thomas Graf 已提交
3291
	[IFLA_VXLAN_GBP]	= { .type = NLA_FLAG, },
J
Jiri Benc 已提交
3292
	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
3293
	[IFLA_VXLAN_REMCSUM_NOPARTIAL]	= { .type = NLA_FLAG },
H
Hangbin Liu 已提交
3294
	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
3295
	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
S
stephen hemminger 已提交
3296 3297
};

3298 3299
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
			  struct netlink_ext_ack *extack)
S
stephen hemminger 已提交
3300 3301 3302
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
3303 3304
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
					    "Provided link layer address is not Ethernet");
S
stephen hemminger 已提交
3305 3306 3307 3308
			return -EINVAL;
		}

		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
3309 3310
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
					    "Provided Ethernet address is not unicast");
S
stephen hemminger 已提交
3311 3312 3313 3314
			return -EADDRNOTAVAIL;
		}
	}

3315
	if (tb[IFLA_MTU]) {
3316
		u32 mtu = nla_get_u32(tb[IFLA_MTU]);
3317

3318 3319 3320
		if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
					    "MTU must be between 68 and 65535");
3321
			return -EINVAL;
3322
		}
3323 3324
	}

3325 3326 3327
	if (!data) {
		NL_SET_ERR_MSG(extack,
			       "Required attributes not provided to perform the operation");
S
stephen hemminger 已提交
3328
		return -EINVAL;
3329
	}
S
stephen hemminger 已提交
3330 3331

	if (data[IFLA_VXLAN_ID]) {
3332 3333
		u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);

3334
		if (id >= VXLAN_N_VID) {
3335
			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_ID],
3336
					    "VXLAN ID must be lower than 16777216");
S
stephen hemminger 已提交
3337
			return -ERANGE;
3338
		}
S
stephen hemminger 已提交
3339 3340
	}

3341 3342 3343 3344 3345
	if (data[IFLA_VXLAN_PORT_RANGE]) {
		const struct ifla_vxlan_port_range *p
			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);

		if (ntohs(p->high) < ntohs(p->low)) {
3346
			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_PORT_RANGE],
3347
					    "Invalid source port range");
3348 3349 3350 3351
			return -EINVAL;
		}
	}

3352 3353 3354 3355
	if (data[IFLA_VXLAN_DF]) {
		enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]);

		if (df < 0 || df > VXLAN_DF_MAX) {
3356
			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_DF],
3357 3358 3359 3360 3361
					    "Invalid DF attribute");
			return -EINVAL;
		}
	}

S
stephen hemminger 已提交
3362 3363 3364
	return 0;
}

Y
Yan Burman 已提交
3365 3366 3367 3368 3369 3370 3371
/* ethtool get_drvinfo: report the driver name and version strings. */
static void vxlan_get_drvinfo(struct net_device *netdev,
			      struct ethtool_drvinfo *drvinfo)
{
	strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
	strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
}

3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390
/* ethtool get_link_ksettings: forward the query to the lower device when
 * one is found for the default destination's ifindex; otherwise report
 * unknown duplex/speed and PORT_OTHER.
 */
static int vxlan_get_link_ksettings(struct net_device *dev,
				    struct ethtool_link_ksettings *cmd)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct net_device *lowerdev;

	lowerdev = __dev_get_by_index(vxlan->net,
				      vxlan->default_dst.remote_ifindex);
	if (lowerdev)
		return __ethtool_get_link_ksettings(lowerdev, cmd);

	cmd->base.duplex = DUPLEX_UNKNOWN;
	cmd->base.port = PORT_OTHER;
	cmd->base.speed = SPEED_UNKNOWN;

	return 0;
}

Y
Yan Burman 已提交
3391
static const struct ethtool_ops vxlan_ethtool_ops = {
3392 3393 3394
	.get_drvinfo		= vxlan_get_drvinfo,
	.get_link		= ethtool_op_get_link,
	.get_link_ksettings	= vxlan_get_link_ksettings,
Y
Yan Burman 已提交
3395 3396
};

T
Tom Herbert 已提交
3397
static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
3398
					__be16 port, u32 flags, int ifindex)
3399
{
C
Cong Wang 已提交
3400
	struct socket *sock;
T
Tom Herbert 已提交
3401 3402
	struct udp_port_cfg udp_conf;
	int err;
C
Cong Wang 已提交
3403

T
Tom Herbert 已提交
3404
	memset(&udp_conf, 0, sizeof(udp_conf));
C
Cong Wang 已提交
3405

T
Tom Herbert 已提交
3406 3407 3408
	if (ipv6) {
		udp_conf.family = AF_INET6;
		udp_conf.use_udp6_rx_checksums =
3409
		    !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
3410
		udp_conf.ipv6_v6only = 1;
T
Tom Herbert 已提交
3411 3412
	} else {
		udp_conf.family = AF_INET;
C
Cong Wang 已提交
3413 3414
	}

T
Tom Herbert 已提交
3415
	udp_conf.local_udp_port = port;
3416
	udp_conf.bind_ifindex = ifindex;
3417

T
Tom Herbert 已提交
3418 3419 3420 3421
	/* Open UDP socket */
	err = udp_sock_create(net, &udp_conf, &sock);
	if (err < 0)
		return ERR_PTR(err);
C
Cong Wang 已提交
3422

Z
Zhi Yong Wu 已提交
3423
	return sock;
C
Cong Wang 已提交
3424 3425 3426
}

/* Create new listen socket if needed */
3427
static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
3428 3429
					      __be16 port, u32 flags,
					      int ifindex)
C
Cong Wang 已提交
3430 3431 3432 3433 3434
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_sock *vs;
	struct socket *sock;
	unsigned int h;
3435
	struct udp_tunnel_sock_cfg tunnel_cfg;
C
Cong Wang 已提交
3436

3437
	vs = kzalloc(sizeof(*vs), GFP_KERNEL);
C
Cong Wang 已提交
3438 3439 3440 3441 3442 3443
	if (!vs)
		return ERR_PTR(-ENOMEM);

	for (h = 0; h < VNI_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vs->vni_list[h]);

3444
	sock = vxlan_create_sock(net, ipv6, port, flags, ifindex);
Z
Zhi Yong Wu 已提交
3445
	if (IS_ERR(sock)) {
3446
		kfree(vs);
3447
		return ERR_CAST(sock);
3448
	}
C
Cong Wang 已提交
3449 3450

	vs->sock = sock;
3451
	refcount_set(&vs->refcnt, 1);
3452
	vs->flags = (flags & VXLAN_F_RCV_FLAGS);
3453

3454 3455
	spin_lock(&vn->sock_lock);
	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
3456
	udp_tunnel_notify_add_rx_port(sock,
3457 3458
				      (vs->flags & VXLAN_F_GPE) ?
				      UDP_TUNNEL_TYPE_VXLAN_GPE :
3459
				      UDP_TUNNEL_TYPE_VXLAN);
3460
	spin_unlock(&vn->sock_lock);
3461 3462

	/* Mark socket as an encapsulation socket. */
3463
	memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
3464 3465
	tunnel_cfg.sk_user_data = vs;
	tunnel_cfg.encap_type = 1;
3466
	tunnel_cfg.encap_rcv = vxlan_rcv;
S
Stefano Brivio 已提交
3467
	tunnel_cfg.encap_err_lookup = vxlan_err_lookup;
3468
	tunnel_cfg.encap_destroy = NULL;
3469 3470
	tunnel_cfg.gro_receive = vxlan_gro_receive;
	tunnel_cfg.gro_complete = vxlan_gro_complete;
3471 3472

	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
C
Cong Wang 已提交
3473

3474 3475 3476
	return vs;
}

3477
/* Attach @vxlan to a socket of the requested address family.
 *
 * Unless cfg.no_share is set, an existing matching socket is reused (with
 * a reference taken under the socket lock); otherwise a new one is created.
 * Returns 0 on success, -EBUSY if a matching socket was found but its
 * refcount had already reached zero, or the error from socket creation.
 */
static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
{
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	struct vxlan_sock *vs = NULL;
	struct vxlan_dev_node *node;
	int l3mdev_index = 0;

	if (vxlan->cfg.remote_ifindex)
		l3mdev_index = l3mdev_master_upper_ifindex_by_index(
			vxlan->net, vxlan->cfg.remote_ifindex);

	if (!vxlan->cfg.no_share) {
		bool dying;

		spin_lock(&vn->sock_lock);
		vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
				     vxlan->cfg.dst_port, vxlan->cfg.flags,
				     l3mdev_index);
		dying = vs && !refcount_inc_not_zero(&vs->refcnt);
		spin_unlock(&vn->sock_lock);

		if (dying)
			return -EBUSY;
	}

	if (!vs)
		vs = vxlan_socket_create(vxlan->net, ipv6,
					 vxlan->cfg.dst_port, vxlan->cfg.flags,
					 l3mdev_index);
	if (IS_ERR(vs))
		return PTR_ERR(vs);

#if IS_ENABLED(CONFIG_IPV6)
	if (ipv6) {
		rcu_assign_pointer(vxlan->vn6_sock, vs);
		node = &vxlan->hlist6;
	} else
#endif
	{
		rcu_assign_pointer(vxlan->vn4_sock, vs);
		node = &vxlan->hlist4;
	}

	vxlan_vs_add_dev(vs, vxlan, node);

	return 0;
}

3519 3520
static int vxlan_sock_add(struct vxlan_dev *vxlan)
{
3521 3522
	bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
	bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata;
3523
	bool ipv4 = !ipv6 || metadata;
3524 3525
	int ret = 0;

3526
	RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
3527
#if IS_ENABLED(CONFIG_IPV6)
3528
	RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
3529
	if (ipv6) {
3530
		ret = __vxlan_sock_add(vxlan, true);
3531 3532 3533
		if (ret < 0 && ret != -EAFNOSUPPORT)
			ipv4 = false;
	}
3534
#endif
3535
	if (ipv4)
3536 3537 3538 3539 3540 3541
		ret = __vxlan_sock_add(vxlan, false);
	if (ret < 0)
		vxlan_sock_release(vxlan);
	return ret;
}

3542 3543
static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf,
				 struct net_device **lower,
3544 3545
				 struct vxlan_dev *old,
				 struct netlink_ext_ack *extack)
S
stephen hemminger 已提交
3546
{
3547
	struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
3548
	struct vxlan_dev *tmp;
C
Cong Wang 已提交
3549
	bool use_ipv6 = false;
S
stephen hemminger 已提交
3550

3551 3552 3553 3554 3555 3556 3557 3558
	if (conf->flags & VXLAN_F_GPE) {
		/* For now, allow GPE only together with
		 * COLLECT_METADATA. This can be relaxed later; in such
		 * case, the other side of the PtP link will have to be
		 * provided.
		 */
		if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
		    !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
3559 3560
			NL_SET_ERR_MSG(extack,
				       "VXLAN GPE does not support this combination of attributes");
3561
			return -EINVAL;
3562
		}
J
Jiri Benc 已提交
3563
	}
3564

3565 3566
	if (!conf->remote_ip.sa.sa_family && !conf->saddr.sa.sa_family) {
		/* Unless IPv6 is explicitly requested, assume IPv4 */
3567
		conf->remote_ip.sa.sa_family = AF_INET;
3568 3569 3570 3571 3572 3573 3574
		conf->saddr.sa.sa_family = AF_INET;
	} else if (!conf->remote_ip.sa.sa_family) {
		conf->remote_ip.sa.sa_family = conf->saddr.sa.sa_family;
	} else if (!conf->saddr.sa.sa_family) {
		conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family;
	}

3575 3576 3577
	if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) {
		NL_SET_ERR_MSG(extack,
			       "Local and remote address must be from the same family");
3578
		return -EINVAL;
3579
	}
C
Cong Wang 已提交
3580

3581 3582
	if (vxlan_addr_multicast(&conf->saddr)) {
		NL_SET_ERR_MSG(extack, "Local address cannot be multicast");
3583
		return -EINVAL;
3584
	}
3585

3586
	if (conf->saddr.sa.sa_family == AF_INET6) {
3587 3588 3589
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG(extack,
				       "IPv6 support not enabled in the kernel");
3590
			return -EPFNOSUPPORT;
3591
		}
C
Cong Wang 已提交
3592
		use_ipv6 = true;
3593
		conf->flags |= VXLAN_F_IPV6;
3594 3595 3596 3597 3598 3599 3600 3601 3602

		if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) {
			int local_type =
				ipv6_addr_type(&conf->saddr.sin6.sin6_addr);
			int remote_type =
				ipv6_addr_type(&conf->remote_ip.sin6.sin6_addr);

			if (local_type & IPV6_ADDR_LINKLOCAL) {
				if (!(remote_type & IPV6_ADDR_LINKLOCAL) &&
3603 3604 3605
				    (remote_type != IPV6_ADDR_ANY)) {
					NL_SET_ERR_MSG(extack,
						       "Invalid combination of local and remote address scopes");
3606
					return -EINVAL;
3607
				}
3608 3609 3610 3611

				conf->flags |= VXLAN_F_IPV6_LINKLOCAL;
			} else {
				if (remote_type ==
3612 3613 3614
				    (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL)) {
					NL_SET_ERR_MSG(extack,
						       "Invalid combination of local and remote address scopes");
3615
					return -EINVAL;
3616
				}
3617 3618 3619 3620

				conf->flags &= ~VXLAN_F_IPV6_LINKLOCAL;
			}
		}
3621
	}
S
stephen hemminger 已提交
3622

3623 3624 3625
	if (conf->label && !use_ipv6) {
		NL_SET_ERR_MSG(extack,
			       "Label attribute only applies to IPv6 VXLAN devices");
3626
		return -EINVAL;
3627
	}
3628

3629 3630
	if (conf->remote_ifindex) {
		struct net_device *lowerdev;
3631

3632
		lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
3633 3634 3635
		if (!lowerdev) {
			NL_SET_ERR_MSG(extack,
				       "Invalid local interface, device not found");
3636
			return -ENODEV;
3637
		}
S
stephen hemminger 已提交
3638

C
Cong Wang 已提交
3639 3640 3641
#if IS_ENABLED(CONFIG_IPV6)
		if (use_ipv6) {
			struct inet6_dev *idev = __in6_dev_get(lowerdev);
3642 3643 3644
			if (idev && idev->cnf.disable_ipv6) {
				NL_SET_ERR_MSG(extack,
					       "IPv6 support disabled by administrator");
C
Cong Wang 已提交
3645
				return -EPERM;
3646
			}
C
Cong Wang 已提交
3647 3648 3649
		}
#endif

3650 3651
		*lower = lowerdev;
	} else {
3652 3653 3654 3655
		if (vxlan_addr_multicast(&conf->remote_ip)) {
			NL_SET_ERR_MSG(extack,
				       "Local interface required for multicast remote destination");

3656
			return -EINVAL;
3657
		}
3658

3659
#if IS_ENABLED(CONFIG_IPV6)
3660 3661 3662
		if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) {
			NL_SET_ERR_MSG(extack,
				       "Local interface required for link-local local/remote addresses");
3663
			return -EINVAL;
3664
		}
3665 3666
#endif

3667
		*lower = NULL;
J
Jiri Benc 已提交
3668
	}
S
stephen hemminger 已提交
3669

3670 3671 3672 3673 3674
	if (!conf->dst_port) {
		if (conf->flags & VXLAN_F_GPE)
			conf->dst_port = htons(4790); /* IANA VXLAN-GPE port */
		else
			conf->dst_port = htons(vxlan_port);
3675 3676
	}

3677 3678
	if (!conf->age_interval)
		conf->age_interval = FDB_AGE_DEFAULT;
3679

3680 3681 3682
	list_for_each_entry(tmp, &vn->vxlan_list, next) {
		if (tmp == old)
			continue;
3683

3684 3685 3686 3687 3688
		if (tmp->cfg.vni != conf->vni)
			continue;
		if (tmp->cfg.dst_port != conf->dst_port)
			continue;
		if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) !=
3689
		    (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)))
3690 3691 3692 3693 3694 3695
			continue;

		if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) &&
		    tmp->cfg.remote_ifindex != conf->remote_ifindex)
			continue;

3696 3697
		NL_SET_ERR_MSG(extack,
			       "A VXLAN device with the specified VNI already exists");
3698
		return -EEXIST;
3699
	}
3700

3701 3702 3703 3704 3705
	return 0;
}

/* Apply an already validated configuration to @dev.
 *
 * On creation (@changelink false) this also selects raw vs. Ethernet
 * setup, applies an explicit MTU and records the source namespace. The
 * default destination, GSO limits, MTU clamp and needed headroom are then
 * derived from @conf and, when present, @lowerdev. Finally @conf is
 * copied into the device's stored configuration.
 */
static void vxlan_config_apply(struct net_device *dev,
			       struct vxlan_config *conf,
			       struct net_device *lowerdev,
			       struct net *src_net,
			       bool changelink)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	bool use_ipv6 = !!(conf->flags & VXLAN_F_IPV6);
	unsigned short needed_headroom = ETH_HLEN;
	int max_mtu = ETH_MAX_MTU;

	if (!changelink) {
		if (conf->flags & VXLAN_F_GPE)
			vxlan_raw_setup(dev);
		else
			vxlan_ether_setup(dev);

		if (conf->mtu)
			dev->mtu = conf->mtu;

		vxlan->net = src_net;
	}

	dst->remote_vni = conf->vni;
	memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip));

	if (lowerdev) {
		dst->remote_ifindex = conf->remote_ifindex;

		dev->gso_max_size = lowerdev->gso_max_size;
		dev->gso_max_segs = lowerdev->gso_max_segs;

		needed_headroom = lowerdev->hard_header_len;

		/* Largest inner MTU that still fits in the lower device,
		 * but never below the Ethernet minimum.
		 */
		max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM :
					   VXLAN_HEADROOM);
		if (max_mtu < ETH_MIN_MTU)
			max_mtu = ETH_MIN_MTU;

		if (!changelink && !conf->mtu)
			dev->mtu = max_mtu;
	}

	if (dev->mtu > max_mtu)
		dev->mtu = max_mtu;

	/* Metadata-collecting devices reserve the IPv6-sized headroom. */
	needed_headroom += (use_ipv6 ||
			    (conf->flags & VXLAN_F_COLLECT_METADATA)) ?
			   VXLAN6_HEADROOM : VXLAN_HEADROOM;
	dev->needed_headroom = needed_headroom;

	memcpy(&vxlan->cfg, conf, sizeof(*conf));
}
3760

3761
/* Validate @conf for @dev and, if it is acceptable, apply it.
 * Returns 0 on success or the error from vxlan_config_validate().
 */
static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
			       struct vxlan_config *conf, bool changelink,
			       struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct net_device *lowerdev;
	int err;

	err = vxlan_config_validate(src_net, conf, &lowerdev, vxlan, extack);
	if (!err)
		vxlan_config_apply(dev, conf, lowerdev, src_net, changelink);

	return err;
}

N
Nicolas Dichtel 已提交
3778
static int __vxlan_dev_create(struct net *net, struct net_device *dev,
3779 3780
			      struct vxlan_config *conf,
			      struct netlink_ext_ack *extack)
N
Nicolas Dichtel 已提交
3781 3782 3783
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan = netdev_priv(dev);
3784
	struct net_device *remote_dev = NULL;
3785
	struct vxlan_fdb *f = NULL;
3786
	bool unregister = false;
3787
	struct vxlan_rdst *dst;
N
Nicolas Dichtel 已提交
3788 3789
	int err;

3790
	dst = &vxlan->default_dst;
3791
	err = vxlan_dev_configure(net, dev, conf, false, extack);
N
Nicolas Dichtel 已提交
3792 3793 3794 3795 3796 3797
	if (err)
		return err;

	dev->ethtool_ops = &vxlan_ethtool_ops;

	/* create an fdb entry for a valid default destination */
3798
	if (!vxlan_addr_any(&dst->remote_ip)) {
3799
		err = vxlan_fdb_create(vxlan, all_zeros_mac,
3800
				       &dst->remote_ip,
N
Nicolas Dichtel 已提交
3801 3802
				       NUD_REACHABLE | NUD_PERMANENT,
				       vxlan->cfg.dst_port,
3803 3804 3805
				       dst->remote_vni,
				       dst->remote_vni,
				       dst->remote_ifindex,
3806
				       NTF_SELF, 0, &f, extack);
N
Nicolas Dichtel 已提交
3807 3808 3809 3810 3811
		if (err)
			return err;
	}

	err = register_netdevice(dev);
3812 3813
	if (err)
		goto errout;
3814
	unregister = true;
3815

3816 3817 3818 3819 3820 3821 3822 3823 3824 3825
	if (dst->remote_ifindex) {
		remote_dev = __dev_get_by_index(net, dst->remote_ifindex);
		if (!remote_dev)
			goto errout;

		err = netdev_upper_dev_link(remote_dev, dev, extack);
		if (err)
			goto errout;
	}

3826
	err = rtnl_configure_link(dev, NULL);
3827
	if (err)
3828
		goto unlink;
N
Nicolas Dichtel 已提交
3829

3830
	if (f) {
3831
		vxlan_fdb_insert(vxlan, all_zeros_mac, dst->remote_vni, f);
3832 3833

		/* notify default fdb entry */
3834
		err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f),
3835
				       RTM_NEWNEIGH, true, extack);
3836 3837
		if (err) {
			vxlan_fdb_destroy(vxlan, f, false, false);
3838 3839
			if (remote_dev)
				netdev_upper_dev_unlink(remote_dev, dev);
3840 3841
			goto unregister;
		}
3842
	}
3843

N
Nicolas Dichtel 已提交
3844
	list_add(&vxlan->next, &vn->vxlan_list);
3845 3846
	if (remote_dev)
		dst->remote_dev = remote_dev;
N
Nicolas Dichtel 已提交
3847
	return 0;
3848 3849 3850
unlink:
	if (remote_dev)
		netdev_upper_dev_unlink(remote_dev, dev);
3851
errout:
3852 3853 3854 3855
	/* unregister_netdevice() destroys the default FDB entry with deletion
	 * notification. But the addition notification was not sent yet, so
	 * destroy the entry by hand here.
	 */
3856
	if (f)
3857 3858
		__vxlan_fdb_free(f);
unregister:
3859 3860
	if (unregister)
		unregister_netdevice(dev);
3861
	return err;
N
Nicolas Dichtel 已提交
3862 3863
}

3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891
/* Set or clear @mask in conf->flags according to attribute @attrtype.
 *
 * Does nothing if the attribute is absent. An NLA_FLAG attribute always
 * sets the bits; any other attribute is read as a u8 and sets the bits
 * when non-zero, clears them when zero. On a changelink request for an
 * attribute that cannot be changed, reports the error and returns
 * -EOPNOTSUPP. Returns 0 otherwise.
 */
static int vxlan_nl2flag(struct vxlan_config *conf, struct nlattr *tb[],
			  int attrtype, unsigned long mask, bool changelink,
			  bool changelink_supported,
			  struct netlink_ext_ack *extack)
{
	bool enable;

	if (!tb[attrtype])
		return 0;

	if (changelink && !changelink_supported) {
		vxlan_flag_attr_error(attrtype, extack);
		return -EOPNOTSUPP;
	}

	/* The payload is only read for non-flag attributes. */
	enable = vxlan_policy[attrtype].type == NLA_FLAG ||
		 nla_get_u8(tb[attrtype]);

	if (enable)
		conf->flags |= mask;
	else
		conf->flags &= ~mask;

	return 0;
}

R
Roopa Prabhu 已提交
3892 3893
static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
			 struct net_device *dev, struct vxlan_config *conf,
3894
			 bool changelink, struct netlink_ext_ack *extack)
3895
{
R
Roopa Prabhu 已提交
3896
	struct vxlan_dev *vxlan = netdev_priv(dev);
3897
	int err = 0;
3898

R
Roopa Prabhu 已提交
3899
	memset(conf, 0, sizeof(*conf));
3900

R
Roopa Prabhu 已提交
3901 3902 3903 3904 3905 3906 3907
	/* if changelink operation, start with old existing cfg */
	if (changelink)
		memcpy(conf, &vxlan->cfg, sizeof(*conf));

	if (data[IFLA_VXLAN_ID]) {
		__be32 vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));

3908 3909
		if (changelink && (vni != conf->vni)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID], "Cannot change VNI");
R
Roopa Prabhu 已提交
3910
			return -EOPNOTSUPP;
3911
		}
R
Roopa Prabhu 已提交
3912 3913
		conf->vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));
	}
3914 3915

	if (data[IFLA_VXLAN_GROUP]) {
3916 3917
		if (changelink && (conf->remote_ip.sa.sa_family != AF_INET)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP], "New group address family does not match old group");
3918
			return -EOPNOTSUPP;
3919
		}
3920

R
Roopa Prabhu 已提交
3921
		conf->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
3922
		conf->remote_ip.sa.sa_family = AF_INET;
3923
	} else if (data[IFLA_VXLAN_GROUP6]) {
3924 3925
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "IPv6 support not enabled in the kernel");
3926
			return -EPFNOSUPPORT;
3927
		}
3928

3929 3930
		if (changelink && (conf->remote_ip.sa.sa_family != AF_INET6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "New group address family does not match old group");
3931
			return -EOPNOTSUPP;
3932
		}
3933

R
Roopa Prabhu 已提交
3934 3935
		conf->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
		conf->remote_ip.sa.sa_family = AF_INET6;
3936 3937 3938
	}

	if (data[IFLA_VXLAN_LOCAL]) {
3939 3940
		if (changelink && (conf->saddr.sa.sa_family != AF_INET)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL], "New local address family does not match old");
3941
			return -EOPNOTSUPP;
3942
		}
3943

R
Roopa Prabhu 已提交
3944 3945
		conf->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
		conf->saddr.sa.sa_family = AF_INET;
3946
	} else if (data[IFLA_VXLAN_LOCAL6]) {
3947 3948
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "IPv6 support not enabled in the kernel");
3949
			return -EPFNOSUPPORT;
3950
		}
3951

3952 3953
		if (changelink && (conf->saddr.sa.sa_family != AF_INET6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "New local address family does not match old");
3954
			return -EOPNOTSUPP;
3955
		}
3956

3957
		/* TODO: respect scope id */
R
Roopa Prabhu 已提交
3958 3959
		conf->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
		conf->saddr.sa.sa_family = AF_INET6;
3960 3961 3962
	}

	if (data[IFLA_VXLAN_LINK])
R
Roopa Prabhu 已提交
3963
		conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);
3964

S
stephen hemminger 已提交
3965
	if (data[IFLA_VXLAN_TOS])
R
Roopa Prabhu 已提交
3966
		conf->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
S
stephen hemminger 已提交
3967

3968
	if (data[IFLA_VXLAN_TTL])
R
Roopa Prabhu 已提交
3969
		conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
3970

H
Hangbin Liu 已提交
3971
	if (data[IFLA_VXLAN_TTL_INHERIT]) {
3972 3973 3974 3975 3976 3977
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_TTL_INHERIT,
				    VXLAN_F_TTL_INHERIT, changelink, false,
				    extack);
		if (err)
			return err;

H
Hangbin Liu 已提交
3978 3979
	}

3980
	if (data[IFLA_VXLAN_LABEL])
R
Roopa Prabhu 已提交
3981
		conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
3982 3983
			     IPV6_FLOWLABEL_MASK;

R
Roopa Prabhu 已提交
3984
	if (data[IFLA_VXLAN_LEARNING]) {
3985 3986 3987 3988 3989
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING,
				    VXLAN_F_LEARN, changelink, true,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
3990 3991 3992 3993
	} else if (!changelink) {
		/* default to learn on a new device */
		conf->flags |= VXLAN_F_LEARN;
	}
S
stephen hemminger 已提交
3994

I
Ido Schimmel 已提交
3995
	if (data[IFLA_VXLAN_AGEING])
R
Roopa Prabhu 已提交
3996
		conf->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
S
stephen hemminger 已提交
3997

R
Roopa Prabhu 已提交
3998
	if (data[IFLA_VXLAN_PROXY]) {
3999 4000 4001 4002 4003
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_PROXY,
				    VXLAN_F_PROXY, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4004
	}
D
David Stevens 已提交
4005

R
Roopa Prabhu 已提交
4006
	if (data[IFLA_VXLAN_RSC]) {
4007 4008 4009 4010 4011
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_RSC,
				    VXLAN_F_RSC, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4012
	}
D
David Stevens 已提交
4013

R
Roopa Prabhu 已提交
4014
	if (data[IFLA_VXLAN_L2MISS]) {
4015 4016 4017 4018 4019
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L2MISS,
				    VXLAN_F_L2MISS, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4020
	}
D
David Stevens 已提交
4021

R
Roopa Prabhu 已提交
4022
	if (data[IFLA_VXLAN_L3MISS]) {
4023 4024 4025 4026 4027
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L3MISS,
				    VXLAN_F_L3MISS, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4028
	}
D
David Stevens 已提交
4029

R
Roopa Prabhu 已提交
4030
	if (data[IFLA_VXLAN_LIMIT]) {
4031 4032 4033
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LIMIT],
					    "Cannot change limit");
R
Roopa Prabhu 已提交
4034
			return -EOPNOTSUPP;
4035
		}
R
Roopa Prabhu 已提交
4036 4037
		conf->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
	}
S
stephen hemminger 已提交
4038

R
Roopa Prabhu 已提交
4039
	if (data[IFLA_VXLAN_COLLECT_METADATA]) {
4040 4041 4042 4043 4044
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_COLLECT_METADATA,
				    VXLAN_F_COLLECT_METADATA, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4045
	}
4046

4047
	if (data[IFLA_VXLAN_PORT_RANGE]) {
R
Roopa Prabhu 已提交
4048 4049 4050 4051 4052 4053
		if (!changelink) {
			const struct ifla_vxlan_port_range *p
				= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
			conf->port_min = ntohs(p->low);
			conf->port_max = ntohs(p->high);
		} else {
4054 4055
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE],
					    "Cannot change port range");
R
Roopa Prabhu 已提交
4056 4057
			return -EOPNOTSUPP;
		}
4058 4059
	}

R
Roopa Prabhu 已提交
4060
	if (data[IFLA_VXLAN_PORT]) {
4061 4062 4063
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT],
					    "Cannot change port");
R
Roopa Prabhu 已提交
4064
			return -EOPNOTSUPP;
4065
		}
R
Roopa Prabhu 已提交
4066 4067
		conf->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
	}
4068

R
Roopa Prabhu 已提交
4069
	if (data[IFLA_VXLAN_UDP_CSUM]) {
4070 4071 4072
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_UDP_CSUM],
					    "Cannot change UDP_CSUM flag");
R
Roopa Prabhu 已提交
4073
			return -EOPNOTSUPP;
4074
		}
R
Roopa Prabhu 已提交
4075 4076 4077
		if (!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
			conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
	}
4078

R
Roopa Prabhu 已提交
4079
	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) {
4080 4081 4082 4083 4084
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
				    VXLAN_F_UDP_ZERO_CSUM6_TX, changelink,
				    false, extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4085
	}
4086

R
Roopa Prabhu 已提交
4087
	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]) {
4088 4089 4090 4091 4092
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
				    VXLAN_F_UDP_ZERO_CSUM6_RX, changelink,
				    false, extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4093
	}
4094

R
Roopa Prabhu 已提交
4095
	if (data[IFLA_VXLAN_REMCSUM_TX]) {
4096 4097 4098 4099 4100
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_TX,
				    VXLAN_F_REMCSUM_TX, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4101
	}
T
Tom Herbert 已提交
4102

R
Roopa Prabhu 已提交
4103
	if (data[IFLA_VXLAN_REMCSUM_RX]) {
4104 4105 4106 4107 4108
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_RX,
				    VXLAN_F_REMCSUM_RX, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4109 4110 4111
	}

	if (data[IFLA_VXLAN_GBP]) {
4112 4113 4114 4115
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GBP,
				    VXLAN_F_GBP, changelink, false, extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4116 4117 4118
	}

	if (data[IFLA_VXLAN_GPE]) {
4119 4120 4121 4122 4123
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GPE,
				    VXLAN_F_GPE, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4124 4125 4126
	}

	if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) {
4127 4128 4129 4130 4131
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_NOPARTIAL,
				    VXLAN_F_REMCSUM_NOPARTIAL, changelink,
				    false, extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4132 4133 4134
	}

	if (tb[IFLA_MTU]) {
4135 4136 4137
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
					    "Cannot change mtu");
R
Roopa Prabhu 已提交
4138
			return -EOPNOTSUPP;
4139
		}
R
Roopa Prabhu 已提交
4140 4141 4142
		conf->mtu = nla_get_u32(tb[IFLA_MTU]);
	}

4143 4144 4145
	if (data[IFLA_VXLAN_DF])
		conf->df = nla_get_u8(data[IFLA_VXLAN_DF]);

R
Roopa Prabhu 已提交
4146 4147 4148 4149
	return 0;
}

static int vxlan_newlink(struct net *src_net, struct net_device *dev,
4150 4151
			 struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
R
Roopa Prabhu 已提交
4152 4153 4154 4155
{
	struct vxlan_config conf;
	int err;

4156
	err = vxlan_nl2conf(tb, data, dev, &conf, false, extack);
R
Roopa Prabhu 已提交
4157 4158 4159
	if (err)
		return err;

4160
	return __vxlan_dev_create(src_net, dev, &conf, extack);
R
Roopa Prabhu 已提交
4161
}
T
Tom Herbert 已提交
4162

R
Roopa Prabhu 已提交
4163
static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
4164 4165
			    struct nlattr *data[],
			    struct netlink_ext_ack *extack)
R
Roopa Prabhu 已提交
4166 4167
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
4168
	struct net_device *lowerdev;
R
Roopa Prabhu 已提交
4169
	struct vxlan_config conf;
4170
	struct vxlan_rdst *dst;
R
Roopa Prabhu 已提交
4171 4172
	int err;

4173
	dst = &vxlan->default_dst;
4174
	err = vxlan_nl2conf(tb, data, dev, &conf, true, extack);
R
Roopa Prabhu 已提交
4175 4176
	if (err)
		return err;
T
Thomas Graf 已提交
4177

4178 4179
	err = vxlan_config_validate(vxlan->net, &conf, &lowerdev,
				    vxlan, extack);
R
Roopa Prabhu 已提交
4180 4181
	if (err)
		return err;
4182

4183 4184 4185
	if (dst->remote_dev == lowerdev)
		lowerdev = NULL;

4186 4187 4188 4189 4190
	err = netdev_adjacent_change_prepare(dst->remote_dev, lowerdev, dev,
					     extack);
	if (err)
		return err;

R
Roopa Prabhu 已提交
4191
	/* handle default dst entry */
4192
	if (!vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip)) {
4193 4194 4195
		u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni);

		spin_lock_bh(&vxlan->hash_lock[hash_index]);
4196
		if (!vxlan_addr_any(&conf.remote_ip)) {
4197
			err = vxlan_fdb_update(vxlan, all_zeros_mac,
4198
					       &conf.remote_ip,
R
Roopa Prabhu 已提交
4199
					       NUD_REACHABLE | NUD_PERMANENT,
4200
					       NLM_F_APPEND | NLM_F_CREATE,
R
Roopa Prabhu 已提交
4201
					       vxlan->cfg.dst_port,
4202 4203
					       conf.vni, conf.vni,
					       conf.remote_ifindex,
4204
					       NTF_SELF, 0, true, extack);
R
Roopa Prabhu 已提交
4205
			if (err) {
4206
				spin_unlock_bh(&vxlan->hash_lock[hash_index]);
4207 4208
				netdev_adjacent_change_abort(dst->remote_dev,
							     lowerdev, dev);
R
Roopa Prabhu 已提交
4209 4210 4211
				return err;
			}
		}
4212 4213 4214 4215 4216 4217 4218 4219
		if (!vxlan_addr_any(&dst->remote_ip))
			__vxlan_fdb_delete(vxlan, all_zeros_mac,
					   dst->remote_ip,
					   vxlan->cfg.dst_port,
					   dst->remote_vni,
					   dst->remote_vni,
					   dst->remote_ifindex,
					   true);
4220
		spin_unlock_bh(&vxlan->hash_lock[hash_index]);
R
Roopa Prabhu 已提交
4221
	}
4222

4223 4224 4225
	if (conf.age_interval != vxlan->cfg.age_interval)
		mod_timer(&vxlan->age_timer, jiffies);

4226
	netdev_adjacent_change_commit(dst->remote_dev, lowerdev, dev);
4227
	if (lowerdev && lowerdev != dst->remote_dev) {
4228
		dst->remote_dev = lowerdev;
4229 4230
		netdev_update_lockdep_key(lowerdev);
	}
4231
	vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true);
R
Roopa Prabhu 已提交
4232
	return 0;
S
stephen hemminger 已提交
4233 4234 4235 4236 4237 4238
}

static void vxlan_dellink(struct net_device *dev, struct list_head *head)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

4239 4240
	vxlan_flush(vxlan, true);

4241
	list_del(&vxlan->next);
S
stephen hemminger 已提交
4242
	unregister_netdevice_queue(dev, head);
4243 4244
	if (vxlan->default_dst.remote_dev)
		netdev_upper_dev_unlink(vxlan->default_dst.remote_dev, dev);
S
stephen hemminger 已提交
4245 4246 4247 4248 4249 4250
}

static size_t vxlan_get_size(const struct net_device *dev)
{

	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
C
Cong Wang 已提交
4251
		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
S
stephen hemminger 已提交
4252
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
C
Cong Wang 已提交
4253
		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
S
stephen hemminger 已提交
4254
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
H
Hangbin Liu 已提交
4255
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL_INHERIT */
S
stephen hemminger 已提交
4256
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
4257
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_DF */
4258
		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
S
stephen hemminger 已提交
4259
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
D
David Stevens 已提交
4260 4261 4262 4263
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L2MISS */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L3MISS */
4264
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_COLLECT_METADATA */
S
stephen hemminger 已提交
4265 4266
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
4267
		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
4268 4269 4270 4271
		nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
T
Tom Herbert 已提交
4272 4273
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
S
stephen hemminger 已提交
4274 4275 4276 4277 4278 4279
		0;
}

static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	const struct vxlan_dev *vxlan = netdev_priv(dev);
4280
	const struct vxlan_rdst *dst = &vxlan->default_dst;
4281
	struct ifla_vxlan_port_range ports = {
4282 4283
		.low =  htons(vxlan->cfg.port_min),
		.high = htons(vxlan->cfg.port_max),
4284
	};
S
stephen hemminger 已提交
4285

4286
	if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni)))
S
stephen hemminger 已提交
4287 4288
		goto nla_put_failure;

C
Cong Wang 已提交
4289 4290
	if (!vxlan_addr_any(&dst->remote_ip)) {
		if (dst->remote_ip.sa.sa_family == AF_INET) {
4291 4292
			if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP,
					    dst->remote_ip.sin.sin_addr.s_addr))
C
Cong Wang 已提交
4293 4294 4295
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
4296 4297
			if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6,
					     &dst->remote_ip.sin6.sin6_addr))
C
Cong Wang 已提交
4298 4299 4300 4301
				goto nla_put_failure;
#endif
		}
	}
S
stephen hemminger 已提交
4302

4303
	if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
S
stephen hemminger 已提交
4304 4305
		goto nla_put_failure;

4306 4307
	if (!vxlan_addr_any(&vxlan->cfg.saddr)) {
		if (vxlan->cfg.saddr.sa.sa_family == AF_INET) {
4308
			if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
4309
					    vxlan->cfg.saddr.sin.sin_addr.s_addr))
C
Cong Wang 已提交
4310 4311 4312
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
4313
			if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
4314
					     &vxlan->cfg.saddr.sin6.sin6_addr))
C
Cong Wang 已提交
4315 4316 4317 4318
				goto nla_put_failure;
#endif
		}
	}
S
stephen hemminger 已提交
4319

4320
	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
H
Hangbin Liu 已提交
4321 4322
	    nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT,
		       !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) ||
4323
	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
4324
	    nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
4325
	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
D
David Stevens 已提交
4326
	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
4327
		       !!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
D
David Stevens 已提交
4328
	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
4329
		       !!(vxlan->cfg.flags & VXLAN_F_PROXY)) ||
4330 4331
	    nla_put_u8(skb, IFLA_VXLAN_RSC,
		       !!(vxlan->cfg.flags & VXLAN_F_RSC)) ||
D
David Stevens 已提交
4332
	    nla_put_u8(skb, IFLA_VXLAN_L2MISS,
4333
		       !!(vxlan->cfg.flags & VXLAN_F_L2MISS)) ||
D
David Stevens 已提交
4334
	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
4335
		       !!(vxlan->cfg.flags & VXLAN_F_L3MISS)) ||
4336
	    nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA,
4337
		       !!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) ||
4338 4339 4340
	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
4341
	    nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
4342
		       !(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
4343
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
4344
		       !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
4345
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
4346
		       !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
T
Tom Herbert 已提交
4347
	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
4348
		       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) ||
T
Tom Herbert 已提交
4349
	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
4350
		       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)))
S
stephen hemminger 已提交
4351 4352
		goto nla_put_failure;

4353 4354 4355
	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
		goto nla_put_failure;

4356
	if (vxlan->cfg.flags & VXLAN_F_GBP &&
T
Thomas Graf 已提交
4357 4358 4359
	    nla_put_flag(skb, IFLA_VXLAN_GBP))
		goto nla_put_failure;

4360
	if (vxlan->cfg.flags & VXLAN_F_GPE &&
J
Jiri Benc 已提交
4361 4362 4363
	    nla_put_flag(skb, IFLA_VXLAN_GPE))
		goto nla_put_failure;

4364
	if (vxlan->cfg.flags & VXLAN_F_REMCSUM_NOPARTIAL &&
4365 4366 4367
	    nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
		goto nla_put_failure;

S
stephen hemminger 已提交
4368 4369 4370 4371 4372 4373
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

4374 4375 4376 4377 4378 4379 4380
static struct net *vxlan_get_link_net(const struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

	return vxlan->net;
}

S
stephen hemminger 已提交
4381 4382 4383 4384 4385 4386 4387 4388
/* rtnetlink link operations for the "vxlan" link kind. */
static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
	/* identity and private data layout */
	.kind		= "vxlan",
	.priv_size	= sizeof(struct vxlan_dev),
	.setup		= vxlan_setup,

	/* attribute parsing */
	.maxtype	= IFLA_VXLAN_MAX,
	.policy		= vxlan_policy,
	.validate	= vxlan_validate,

	/* device lifecycle */
	.newlink	= vxlan_newlink,
	.changelink	= vxlan_changelink,
	.dellink	= vxlan_dellink,

	/* dumping */
	.get_size	= vxlan_get_size,
	.fill_info	= vxlan_fill_info,
	.get_link_net	= vxlan_get_link_net,
};

4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406
/* Create and configure a vxlan device from an in-kernel caller.
 *
 * @net:              namespace to create the device in
 * @name:             device name
 * @name_assign_type: passed through to rtnl_create_link()
 * @conf:             VXLAN configuration to create the device with
 *
 * Returns the new net_device, or an ERR_PTR() on failure. If device creation
 * fails the net_device is freed; if the later rtnl_configure_link() fails the
 * device is removed again via vxlan_dellink() and
 * unregister_netdevice_many().
 */
struct net_device *vxlan_dev_create(struct net *net, const char *name,
				    u8 name_assign_type,
				    struct vxlan_config *conf)
{
	struct nlattr *tb[IFLA_MAX + 1];
	struct net_device *ndev;
	LIST_HEAD(kill_list);
	int rc;

	/* No generic link attributes are supplied by in-kernel callers. */
	memset(&tb, 0, sizeof(tb));

	ndev = rtnl_create_link(net, name, name_assign_type,
				&vxlan_link_ops, tb, NULL);
	if (IS_ERR(ndev))
		return ndev;

	rc = __vxlan_dev_create(net, ndev, conf, NULL);
	if (rc < 0) {
		free_netdev(ndev);
		return ERR_PTR(rc);
	}

	rc = rtnl_configure_link(ndev, NULL);
	if (rc >= 0)
		return ndev;

	vxlan_dellink(ndev, &kill_list);
	unregister_netdevice_many(&kill_list);

	return ERR_PTR(rc);
}
EXPORT_SYMBOL_GPL(vxlan_dev_create);

4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451
static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
					     struct net_device *dev)
{
	struct vxlan_dev *vxlan, *next;
	LIST_HEAD(list_kill);

	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
		struct vxlan_rdst *dst = &vxlan->default_dst;

		/* In case we created vxlan device with carrier
		 * and we loose the carrier due to module unload
		 * we also need to remove vxlan device. In other
		 * cases, it's not necessary and remote_ifindex
		 * is 0 here, so no matches.
		 */
		if (dst->remote_ifindex == dev->ifindex)
			vxlan_dellink(vxlan->dev, &list_kill);
	}

	unregister_netdevice_many(&list_kill);
}

4452 4453
static int vxlan_netdevice_event(struct notifier_block *unused,
				 unsigned long event, void *ptr)
4454 4455
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4456
	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
4457

4458 4459
	if (event == NETDEV_UNREGISTER) {
		vxlan_offload_rx_ports(dev, false);
4460
		vxlan_handle_lowerdev_unregister(vn, dev);
4461 4462 4463 4464
	} else if (event == NETDEV_REGISTER) {
		vxlan_offload_rx_ports(dev, true);
	} else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO ||
		   event == NETDEV_UDP_TUNNEL_DROP_INFO) {
4465
		vxlan_offload_rx_ports(dev, event == NETDEV_UDP_TUNNEL_PUSH_INFO);
4466
	}
4467 4468 4469 4470 4471

	return NOTIFY_DONE;
}

static struct notifier_block vxlan_notifier_block __read_mostly = {
4472
	.notifier_call = vxlan_netdevice_event,
4473 4474
};

4475 4476 4477 4478 4479 4480 4481
static void
vxlan_fdb_offloaded_set(struct net_device *dev,
			struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
4482 4483 4484
	u32 hash_index;

	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
4485

4486
	spin_lock_bh(&vxlan->hash_lock[hash_index]);
4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501

	f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
	if (!f)
		goto out;

	rdst = vxlan_fdb_find_rdst(f, &fdb_info->remote_ip,
				   fdb_info->remote_port,
				   fdb_info->remote_vni,
				   fdb_info->remote_ifindex);
	if (!rdst)
		goto out;

	rdst->offloaded = fdb_info->offloaded;

out:
4502
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
4503 4504
}

P
Petr Machata 已提交
4505 4506 4507 4508 4509
static int
vxlan_fdb_external_learn_add(struct net_device *dev,
			     struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
4510
	struct netlink_ext_ack *extack;
4511
	u32 hash_index;
P
Petr Machata 已提交
4512 4513
	int err;

4514
	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
4515 4516
	extack = switchdev_notifier_info_to_extack(&fdb_info->info);

4517
	spin_lock_bh(&vxlan->hash_lock[hash_index]);
P
Petr Machata 已提交
4518 4519 4520 4521 4522 4523 4524 4525
	err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip,
			       NUD_REACHABLE,
			       NLM_F_CREATE | NLM_F_REPLACE,
			       fdb_info->remote_port,
			       fdb_info->vni,
			       fdb_info->remote_vni,
			       fdb_info->remote_ifindex,
			       NTF_USE | NTF_SELF | NTF_EXT_LEARNED,
4526
			       0, false, extack);
4527
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
P
Petr Machata 已提交
4528 4529 4530 4531 4532 4533 4534 4535 4536 4537

	return err;
}

static int
vxlan_fdb_external_learn_del(struct net_device *dev,
			     struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
4538
	u32 hash_index;
P
Petr Machata 已提交
4539 4540
	int err = 0;

4541 4542
	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
	spin_lock_bh(&vxlan->hash_lock[hash_index]);
P
Petr Machata 已提交
4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555

	f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
	if (!f)
		err = -ENOENT;
	else if (f->flags & NTF_EXT_LEARNED)
		err = __vxlan_fdb_delete(vxlan, fdb_info->eth_addr,
					 fdb_info->remote_ip,
					 fdb_info->remote_port,
					 fdb_info->vni,
					 fdb_info->remote_vni,
					 fdb_info->remote_ifindex,
					 false);

4556
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
P
Petr Machata 已提交
4557 4558 4559 4560

	return err;
}

4561 4562 4563 4564
static int vxlan_switchdev_event(struct notifier_block *unused,
				 unsigned long event, void *ptr)
{
	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
P
Petr Machata 已提交
4565 4566
	struct switchdev_notifier_vxlan_fdb_info *fdb_info;
	int err = 0;
4567 4568 4569 4570 4571

	switch (event) {
	case SWITCHDEV_VXLAN_FDB_OFFLOADED:
		vxlan_fdb_offloaded_set(dev, ptr);
		break;
P
Petr Machata 已提交
4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591
	case SWITCHDEV_VXLAN_FDB_ADD_TO_BRIDGE:
		fdb_info = ptr;
		err = vxlan_fdb_external_learn_add(dev, fdb_info);
		if (err) {
			err = notifier_from_errno(err);
			break;
		}
		fdb_info->offloaded = true;
		vxlan_fdb_offloaded_set(dev, fdb_info);
		break;
	case SWITCHDEV_VXLAN_FDB_DEL_TO_BRIDGE:
		fdb_info = ptr;
		err = vxlan_fdb_external_learn_del(dev, fdb_info);
		if (err) {
			err = notifier_from_errno(err);
			break;
		}
		fdb_info->offloaded = false;
		vxlan_fdb_offloaded_set(dev, fdb_info);
		break;
4592 4593
	}

P
Petr Machata 已提交
4594
	return err;
4595 4596 4597 4598 4599 4600
}

/* Notifier block that routes switchdev events to vxlan_switchdev_event();
 * registered in vxlan_init_module() and removed in vxlan_cleanup_module().
 */
static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = {
	.notifier_call = vxlan_switchdev_event,
};

S
stephen hemminger 已提交
4601 4602 4603
/* Per-netns init: set up the socket lock, the (empty) list of vxlan devices
 * and every bucket of the socket hash table. Always returns 0.
 */
static __net_init int vxlan_init_net(struct net *net)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	unsigned int bucket;

	spin_lock_init(&vn->sock_lock);
	INIT_LIST_HEAD(&vn->vxlan_list);

	for (bucket = 0; bucket < PORT_HASH_SIZE; bucket++)
		INIT_HLIST_HEAD(&vn->sock_list[bucket]);

	return 0;
}

4615
static void vxlan_destroy_tunnels(struct net *net, struct list_head *head)
N
Nicolas Dichtel 已提交
4616 4617 4618 4619
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan, *next;
	struct net_device *dev, *aux;
4620
	unsigned int h;
N
Nicolas Dichtel 已提交
4621 4622 4623

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == &vxlan_link_ops)
4624
			unregister_netdevice_queue(dev, head);
N
Nicolas Dichtel 已提交
4625 4626 4627 4628 4629

	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
		/* If vxlan->dev is in the same netns, it has already been added
		 * to the list by the previous loop.
		 */
4630
		if (!net_eq(dev_net(vxlan->dev), net))
4631
			unregister_netdevice_queue(vxlan->dev, head);
N
Nicolas Dichtel 已提交
4632 4633
	}

4634 4635
	for (h = 0; h < PORT_HASH_SIZE; ++h)
		WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h]));
N
Nicolas Dichtel 已提交
4636 4637
}

4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650
/* Batched per-netns exit: under the RTNL lock, collect the vxlan devices of
 * every namespace on @net_list and unregister them all in one go.
 */
static void __net_exit vxlan_exit_batch_net(struct list_head *net_list)
{
	LIST_HEAD(kill_list);
	struct net *net;

	rtnl_lock();

	list_for_each_entry(net, net_list, exit_list) {
		vxlan_destroy_tunnels(net, &kill_list);
	}
	unregister_netdevice_many(&kill_list);

	rtnl_unlock();
}

S
stephen hemminger 已提交
4651 4652
static struct pernet_operations vxlan_net_ops = {
	.init = vxlan_init_net,
4653
	.exit_batch = vxlan_exit_batch_net,
S
stephen hemminger 已提交
4654 4655 4656 4657 4658 4659 4660 4661 4662 4663
	.id   = &vxlan_net_id,
	.size = sizeof(struct vxlan_net),
};

/* Module init: seed vxlan_salt with random bytes, then register, in order,
 * the pernet subsystem, the netdevice notifier, the switchdev notifier and
 * the rtnl link ops. On failure everything registered so far is unwound in
 * reverse order and the error is returned.
 */
static int __init vxlan_init_module(void)
{
	int err;

	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));

	err = register_pernet_subsys(&vxlan_net_ops);
	if (err)
		return err;

	err = register_netdevice_notifier(&vxlan_notifier_block);
	if (err)
		goto err_unreg_pernet;

	err = register_switchdev_notifier(&vxlan_switchdev_notifier_block);
	if (err)
		goto err_unreg_netdev_notifier;

	err = rtnl_link_register(&vxlan_link_ops);
	if (err)
		goto err_unreg_switchdev_notifier;

	return 0;

err_unreg_switchdev_notifier:
	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
err_unreg_netdev_notifier:
	unregister_netdevice_notifier(&vxlan_notifier_block);
err_unreg_pernet:
	unregister_pernet_subsys(&vxlan_net_ops);
	return err;
}
late_initcall(vxlan_init_module);
S
stephen hemminger 已提交
4691 4692 4693

/* Module exit: undo vxlan_init_module() in reverse registration order. */
static void __exit vxlan_cleanup_module(void)
{
	/* link ops first, then the two notifiers */
	rtnl_link_unregister(&vxlan_link_ops);
	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
	unregister_netdevice_notifier(&vxlan_notifier_block);

	/* rcu_barrier() is called by netns */
	unregister_pernet_subsys(&vxlan_net_ops);
}
module_exit(vxlan_cleanup_module);

MODULE_LICENSE("GPL");
MODULE_VERSION(VXLAN_VERSION);
4704
MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>");
J
Jesse Brandeburg 已提交
4705
MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic");
S
stephen hemminger 已提交
4706
MODULE_ALIAS_RTNL_LINK("vxlan");