// SPDX-License-Identifier: GPL-2.0-only
/*
 * VXLAN: Virtual eXtensible Local Area Network
 *
 * Copyright (c) 2012-2013 Vyatta Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/udp.h>
#include <linux/igmp.h>
#include <linux/if_ether.h>
Y
Yan Burman 已提交
17
#include <linux/ethtool.h>
D
David Stevens 已提交
18 19
#include <net/arp.h>
#include <net/ndisc.h>
20
#include <net/ipv6_stubs.h>
S
stephen hemminger 已提交
21 22 23 24 25 26
#include <net/ip.h>
#include <net/icmp.h>
#include <net/rtnetlink.h>
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
27
#include <net/tun_proto.h>
28
#include <net/vxlan.h>
29
#include <net/nexthop.h>
30

C
Cong Wang 已提交
31 32
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ip6_tunnel.h>
33
#include <net/ip6_checksum.h>
C
Cong Wang 已提交
34
#endif
S
stephen hemminger 已提交
35 36 37

#define VXLAN_VERSION	"0.1"

38 39
#define PORT_HASH_BITS	8
#define PORT_HASH_SIZE  (1<<PORT_HASH_BITS)
S
stephen hemminger 已提交
40 41 42
#define FDB_AGE_DEFAULT 300 /* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */

43 44
/* UDP port for VXLAN traffic.
 * The IANA assigned port is 4789, but the Linux default is 8472
S
Stephen Hemminger 已提交
45
 * for compatibility with early adopters.
46
 */
47 48
static unsigned short vxlan_port __read_mostly = 8472;
module_param_named(udp_port, vxlan_port, ushort, 0444);
S
stephen hemminger 已提交
49 50 51 52 53 54
MODULE_PARM_DESC(udp_port, "Destination UDP port");

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

55
static unsigned int vxlan_net_id;
56
static struct rtnl_link_ops vxlan_link_ops;
57

58
static const u8 all_zeros_mac[ETH_ALEN + 2];
59

60
static int vxlan_sock_add(struct vxlan_dev *vxlan);
61

62 63
static void vxlan_vs_del_dev(struct vxlan_dev *vxlan);

64 65 66 67
/* per-network namespace private data for this module */
struct vxlan_net {
	struct list_head  vxlan_list;
	struct hlist_head sock_list[PORT_HASH_SIZE];
68
	spinlock_t	  sock_lock;
69 70
};

S
stephen hemminger 已提交
71 72 73 74 75 76
/* Forwarding table entry */
struct vxlan_fdb {
	struct hlist_node hlist;	/* linked list of entries */
	struct rcu_head	  rcu;
	unsigned long	  updated;	/* jiffies */
	unsigned long	  used;
77
	struct list_head  remotes;
78
	u8		  eth_addr[ETH_ALEN];
S
stephen hemminger 已提交
79
	u16		  state;	/* see ndm_state */
80
	__be32		  vni;
P
Petr Machata 已提交
81
	u16		  flags;	/* see ndm_flags and below */
82 83
	struct list_head  nh_list;
	struct nexthop __rcu *nh;
84
	struct vxlan_dev  __rcu *vdev;
S
stephen hemminger 已提交
85 86
};

P
Petr Machata 已提交
87 88
#define NTF_VXLAN_ADDED_BY_USER 0x100

S
stephen hemminger 已提交
89 90 91
/* Salt for the FDB hash table; passed as the initval to jhash_2words()
 * in eth_vni_hash().
 */
static u32 vxlan_salt __read_mostly;

T
Thomas Graf 已提交
92 93
static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
{
94 95
	return vs->flags & VXLAN_F_COLLECT_METADATA ||
	       ip_tunnel_collect_metadata();
T
Thomas Graf 已提交
96 97
}

C
Cong Wang 已提交
98 99 100 101
#if IS_ENABLED(CONFIG_IPV6)
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
J
Jiri Benc 已提交
102 103 104 105 106 107
	if (a->sa.sa_family != b->sa.sa_family)
		return false;
	if (a->sa.sa_family == AF_INET6)
		return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
	else
		return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
C
Cong Wang 已提交
108 109 110 111
}

static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
J
Jiri Benc 已提交
112
	if (nla_len(nla) >= sizeof(struct in6_addr)) {
113
		ip->sin6.sin6_addr = nla_get_in6_addr(nla);
J
Jiri Benc 已提交
114 115 116
		ip->sa.sa_family = AF_INET6;
		return 0;
	} else if (nla_len(nla) >= sizeof(__be32)) {
117
		ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
J
Jiri Benc 已提交
118 119 120 121 122
		ip->sa.sa_family = AF_INET;
		return 0;
	} else {
		return -EAFNOSUPPORT;
	}
C
Cong Wang 已提交
123 124 125
}

static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
J
Jiri Benc 已提交
126
			      const union vxlan_addr *ip)
C
Cong Wang 已提交
127
{
J
Jiri Benc 已提交
128
	if (ip->sa.sa_family == AF_INET6)
129
		return nla_put_in6_addr(skb, attr, &ip->sin6.sin6_addr);
J
Jiri Benc 已提交
130
	else
131
		return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
C
Cong Wang 已提交
132 133 134 135 136 137 138
}

#else /* !CONFIG_IPV6 */

static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
J
Jiri Benc 已提交
139
	return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
C
Cong Wang 已提交
140 141 142 143
}

static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
J
Jiri Benc 已提交
144 145 146
	if (nla_len(nla) >= sizeof(struct in6_addr)) {
		return -EAFNOSUPPORT;
	} else if (nla_len(nla) >= sizeof(__be32)) {
147
		ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
J
Jiri Benc 已提交
148 149 150 151 152
		ip->sa.sa_family = AF_INET;
		return 0;
	} else {
		return -EAFNOSUPPORT;
	}
C
Cong Wang 已提交
153 154 155
}

static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
J
Jiri Benc 已提交
156
			      const union vxlan_addr *ip)
C
Cong Wang 已提交
157
{
158
	return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
C
Cong Wang 已提交
159 160 161
}
#endif

162
/* Virtual Network hash table head */
163
static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni)
164
{
165
	return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)];
166 167 168 169
}

/* Socket hash table head: bucket of the per-netns sock_list for @port. */
static inline struct hlist_head *vs_head(struct net *net, __be16 port)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	const u32 bucket = hash_32(ntohs(port), PORT_HASH_BITS);

	return &vn->sock_list[bucket];
}

176 177 178
/* First remote destination for a forwarding entry.
 * Guaranteed to be non-NULL because remotes are never deleted.
 */
179
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
180
{
181 182
	if (rcu_access_pointer(fdb->nh))
		return NULL;
183 184 185 186 187
	return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
}

static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
{
188 189
	if (rcu_access_pointer(fdb->nh))
		return NULL;
190
	return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
191 192
}

193 194 195 196
/* Find VXLAN socket based on network namespace, address family and UDP port
 * and enabled unshareable flags.
 */
static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
197
					  __be16 port, u32 flags, int ifindex)
198 199
{
	struct vxlan_sock *vs;
200 201

	flags &= VXLAN_F_RCV_FLAGS;
202 203

	hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
204
		if (inet_sk(vs->sock->sk)->inet_sport == port &&
205
		    vxlan_get_sk_family(vs) == family &&
206 207
		    vs->flags == flags &&
		    vs->sock->sk->sk_bound_dev_if == ifindex)
208 209 210
			return vs;
	}
	return NULL;
S
stephen hemminger 已提交
211 212
}

213 214
/* Find the VXLAN device attached to socket @vs that handles @vni.
 * With IPv6 enabled, devices flagged VXLAN_F_IPV6_LINKLOCAL additionally
 * have to match @ifindex.  Returns NULL when nothing matches.
 */
static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex,
					   __be32 vni)
{
	struct vxlan_dev_node *node;

	/* For flow based devices, map all packets to VNI 0 */
	if (vs->flags & VXLAN_F_COLLECT_METADATA)
		vni = 0;

	hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) {
		if (node->vxlan->default_dst.remote_vni != vni)
			continue;

		if (IS_ENABLED(CONFIG_IPV6) &&
		    (node->vxlan->cfg.flags & VXLAN_F_IPV6_LINKLOCAL) &&
		    node->vxlan->cfg.remote_ifindex != ifindex)
			continue;

		return node->vxlan;
	}

	return NULL;
}

P
Pravin B Shelar 已提交
240
/* Look up VNI in a per net namespace table */
241 242 243
static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex,
					__be32 vni, sa_family_t family,
					__be16 port, u32 flags)
P
Pravin B Shelar 已提交
244 245 246
{
	struct vxlan_sock *vs;

247
	vs = vxlan_find_sock(net, family, port, flags, ifindex);
P
Pravin B Shelar 已提交
248 249 250
	if (!vs)
		return NULL;

251
	return vxlan_vs_find_vni(vs, ifindex, vni);
P
Pravin B Shelar 已提交
252 253
}

S
stephen hemminger 已提交
254 255
/* Fill in neighbour message in skbuff.
 *
 * Builds one netlink message of @type describing @fdb, and @rdst when one is
 * given.  Entries backed by a nexthop object report NDA_NH_ID instead of the
 * per-remote attributes.  Returns 0 on success or -EMSGSIZE when @skb has no
 * room; in that case the partially built message is cancelled.
 */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
			  const struct vxlan_fdb *fdb,
			  u32 portid, u32 seq, int type, unsigned int flags,
			  const struct vxlan_rdst *rdst)
{
	const unsigned long now = jiffies;
	struct nda_cacheinfo ci;
	bool send_eth = true;
	bool send_ip = true;
	struct nlmsghdr *nlh;
	struct nexthop *nh;
	struct ndmsg *ndm;
	int nh_family;
	u32 nh_id;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
	if (!nlh)
		return -EMSGSIZE;

	ndm = nlmsg_data(nlh);
	memset(ndm, 0, sizeof(*ndm));

	/* Snapshot the nexthop fields under RCU.  After the unlock, nh is
	 * only tested for NULL and never dereferenced again.
	 */
	rcu_read_lock();
	nh = rcu_dereference(fdb->nh);
	if (nh) {
		nh_family = nexthop_get_family(nh);
		nh_id = nh->id;
	}
	rcu_read_unlock();

	if (type != RTM_GETNEIGH) {
		ndm->ndm_family = AF_BRIDGE;
	} else {
		if (rdst) {
			send_ip = !vxlan_addr_any(&rdst->remote_ip);
			if (send_ip)
				ndm->ndm_family = rdst->remote_ip.sa.sa_family;
			else
				ndm->ndm_family = AF_INET;
		} else if (nh) {
			ndm->ndm_family = nh_family;
		}
		send_eth = !is_zero_ether_addr(fdb->eth_addr);
	}

	ndm->ndm_state = fdb->state;
	ndm->ndm_ifindex = vxlan->dev->ifindex;
	ndm->ndm_flags = fdb->flags;
	if (rdst && rdst->offloaded)
		ndm->ndm_flags |= NTF_OFFLOADED;
	ndm->ndm_type = RTN_UNICAST;

	/* Report the link netns id only when it differs from the device's. */
	if (!net_eq(dev_net(vxlan->dev), vxlan->net)) {
		int nsid = peernet2id(dev_net(vxlan->dev), vxlan->net);

		if (nla_put_s32(skb, NDA_LINK_NETNSID, nsid))
			goto nla_put_failure;
	}

	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
		goto nla_put_failure;

	if (nh) {
		if (nla_put_u32(skb, NDA_NH_ID, nh_id))
			goto nla_put_failure;
	} else if (rdst) {
		if (send_ip &&
		    vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
			goto nla_put_failure;

		/* Port and VNI are only reported when they differ from the
		 * device defaults.
		 */
		if (rdst->remote_port &&
		    rdst->remote_port != vxlan->cfg.dst_port &&
		    nla_put_be16(skb, NDA_PORT, rdst->remote_port))
			goto nla_put_failure;

		if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
		    nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
			goto nla_put_failure;

		if (rdst->remote_ifindex &&
		    nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
			goto nla_put_failure;
	}

	if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni &&
	    nla_put_u32(skb, NDA_SRC_VNI, be32_to_cpu(fdb->vni)))
		goto nla_put_failure;

	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
	ci.ndm_confirmed = 0;
	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
	ci.ndm_refcnt	 = 0;

	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static inline size_t vxlan_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct ndmsg))
		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
C
Cong Wang 已提交
355
		+ nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
356
		+ nla_total_size(sizeof(__be16)) /* NDA_PORT */
357 358
		+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
		+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
359
		+ nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */
S
stephen hemminger 已提交
360 361 362
		+ nla_total_size(sizeof(struct nda_cacheinfo));
}

P
Petr Machata 已提交
363 364
static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			       struct vxlan_rdst *rd, int type)
S
stephen hemminger 已提交
365 366 367 368 369 370 371 372 373
{
	struct net *net = dev_net(vxlan->dev);
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
	if (skb == NULL)
		goto errout;

374
	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd);
S
stephen hemminger 已提交
375 376 377 378 379 380 381 382 383 384 385 386 387 388
	if (err < 0) {
		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}

389 390 391
static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan,
			    const struct vxlan_fdb *fdb,
			    const struct vxlan_rdst *rd,
392
			    struct netlink_ext_ack *extack,
393 394 395
			    struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	fdb_info->info.dev = vxlan->dev;
396
	fdb_info->info.extack = extack;
397 398 399 400 401 402 403 404 405 406
	fdb_info->remote_ip = rd->remote_ip;
	fdb_info->remote_port = rd->remote_port;
	fdb_info->remote_vni = rd->remote_vni;
	fdb_info->remote_ifindex = rd->remote_ifindex;
	memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN);
	fdb_info->vni = fdb->vni;
	fdb_info->offloaded = rd->offloaded;
	fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER;
}

407 408 409
static int vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan,
					      struct vxlan_fdb *fdb,
					      struct vxlan_rdst *rd,
410 411
					      bool adding,
					      struct netlink_ext_ack *extack)
P
Petr Machata 已提交
412 413 414
{
	struct switchdev_notifier_vxlan_fdb_info info;
	enum switchdev_notifier_type notifier_type;
415
	int ret;
P
Petr Machata 已提交
416 417

	if (WARN_ON(!rd))
418
		return 0;
P
Petr Machata 已提交
419 420 421

	notifier_type = adding ? SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE
			       : SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE;
422
	vxlan_fdb_switchdev_notifier_info(vxlan, fdb, rd, NULL, &info);
423
	ret = call_switchdev_notifiers(notifier_type, vxlan->dev,
424
				       &info.info, extack);
425
	return notifier_to_errno(ret);
P
Petr Machata 已提交
426 427
}

428
/* Notify about an FDB change: optionally through switchdev first, then
 * through rtnetlink.  A switchdev failure aborts RTM_NEWNEIGH and is
 * returned to the caller; for RTM_DELNEIGH the switchdev result is ignored.
 * Returns 0 otherwise.
 */
static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			    struct vxlan_rdst *rd, int type, bool swdev_notify,
			    struct netlink_ext_ack *extack)
{
	if (swdev_notify && rd) {
		if (type == RTM_NEWNEIGH) {
			int err;

			err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
								 true, extack);
			if (err)
				return err;
		} else if (type == RTM_DELNEIGH) {
			vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
							   false, extack);
		}
	}

	__vxlan_fdb_notify(vxlan, fdb, rd, type);
	return 0;
}

C
Cong Wang 已提交
453
static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
D
David Stevens 已提交
454 455
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
456 457 458 459
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
	struct vxlan_rdst remote = {
C
Cong Wang 已提交
460
		.remote_ip = *ipa, /* goes to NDA_DST */
461
		.remote_vni = cpu_to_be32(VXLAN_N_VID),
462
	};
463

464
	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL);
D
David Stevens 已提交
465 466 467 468
}

static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
{
469 470 471
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
472
	struct vxlan_rdst remote = { };
D
David Stevens 已提交
473 474 475

	memcpy(f.eth_addr, eth_addr, ETH_ALEN);

476
	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL);
D
David Stevens 已提交
477 478
}

S
stephen hemminger 已提交
479 480 481 482 483 484 485 486
/* Hash Ethernet address */
static u32 eth_hash(const unsigned char *addr)
{
	u64 value = get_unaligned((u64 *)addr);

	/* only want 6 bytes */
#ifdef __BIG_ENDIAN
	value >>= 16;
487 488
#else
	value <<= 16;
S
stephen hemminger 已提交
489 490 491 492
#endif
	return hash_64(value, FDB_HASH_BITS);
}

493 494 495 496 497 498 499 500
/* Hash an Ethernet address together with a VNI, salted with vxlan_salt,
 * and reduce the result to an FDB bucket index.
 */
static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
{
	/* use 1 byte of OUI and 3 bytes of NIC */
	const u32 key = get_unaligned((u32 *)(addr + 2));
	const u32 hash = jhash_2words(key, vni, vxlan_salt);

	return hash & (FDB_HASH_SIZE - 1);
}

501 502 503 504 505 506 507 508
/* FDB bucket index for @mac: metadata-collecting devices hash on
 * (MAC, VNI), all others on the MAC alone.
 */
static u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni)
{
	return (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) ?
		eth_vni_hash(mac, vni) : eth_hash(mac);
}

S
stephen hemminger 已提交
509 510
/* Hash chain to use given mac address */
static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
511
						const u8 *mac, __be32 vni)
S
stephen hemminger 已提交
512
{
513
	return &vxlan->fdb_head[fdb_head_index(vxlan, mac, vni)];
S
stephen hemminger 已提交
514 515 516
}

/* Look up Ethernet address in forwarding table */
517
static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
518
					  const u8 *mac, __be32 vni)
S
stephen hemminger 已提交
519
{
520
	struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni);
S
stephen hemminger 已提交
521 522
	struct vxlan_fdb *f;

523
	hlist_for_each_entry_rcu(f, head, hlist) {
524
		if (ether_addr_equal(mac, f->eth_addr)) {
525
			if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
526 527 528 529 530 531
				if (vni == f->vni)
					return f;
			} else {
				return f;
			}
		}
S
stephen hemminger 已提交
532 533 534 535 536
	}

	return NULL;
}

537
static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
538
					const u8 *mac, __be32 vni)
539 540 541
{
	struct vxlan_fdb *f;

542
	f = __vxlan_find_mac(vxlan, mac, vni);
543
	if (f && f->used != jiffies)
544 545 546 547 548
		f->used = jiffies;

	return f;
}

549 550
/* caller should hold vxlan->hash_lock */
static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
C
Cong Wang 已提交
551
					      union vxlan_addr *ip, __be16 port,
552
					      __be32 vni, __u32 ifindex)
553
{
554
	struct vxlan_rdst *rd;
555

556
	list_for_each_entry(rd, &f->remotes, list) {
C
Cong Wang 已提交
557
		if (vxlan_addr_equal(&rd->remote_ip, ip) &&
558 559 560
		    rd->remote_port == port &&
		    rd->remote_vni == vni &&
		    rd->remote_ifindex == ifindex)
561
			return rd;
562
	}
563

564 565 566
	return NULL;
}

567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590
/* Look up the unicast FDB entry for (@mac, @vni) on @dev and describe its
 * first remote in @fdb_info.
 *
 * Returns 0 on success, -EINVAL for a multicast or all-zero @mac, and
 * -ENOENT when no entry exists or the entry has no remote destination to
 * report.  The latter happens for entries backed by a nexthop object, for
 * which first_remote_rcu() returns NULL; dereferencing that result would
 * oops in vxlan_fdb_switchdev_notifier_info().
 */
int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni,
		      struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	/* padded copy: eth_hash() loads 8 bytes from the address */
	u8 eth_addr[ETH_ALEN + 2] = { 0 };
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	int rc = 0;

	if (is_multicast_ether_addr(mac) ||
	    is_zero_ether_addr(mac))
		return -EINVAL;

	ether_addr_copy(eth_addr, mac);

	rcu_read_lock();

	f = __vxlan_find_mac(vxlan, eth_addr, vni);
	if (!f) {
		rc = -ENOENT;
		goto out;
	}

	rdst = first_remote_rcu(f);
	if (!rdst) {
		/* nexthop-backed entry: no remote to describe */
		rc = -ENOENT;
		goto out;
	}

	vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, NULL, fdb_info);

out:
	rcu_read_unlock();
	return rc;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_find_uc);

P
Petr Machata 已提交
599 600 601
static int vxlan_fdb_notify_one(struct notifier_block *nb,
				const struct vxlan_dev *vxlan,
				const struct vxlan_fdb *f,
602 603
				const struct vxlan_rdst *rdst,
				struct netlink_ext_ack *extack)
P
Petr Machata 已提交
604 605 606 607
{
	struct switchdev_notifier_vxlan_fdb_info fdb_info;
	int rc;

608
	vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, extack, &fdb_info);
P
Petr Machata 已提交
609 610 611 612 613 614
	rc = nb->notifier_call(nb, SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE,
			       &fdb_info);
	return notifier_to_errno(rc);
}

/* Replay every remote of every FDB entry with VNI @vni on @dev to notifier
 * block @nb as an "add to device" event.  Each hash bucket is walked under
 * its own hash_lock.  Stops at the first notifier error and returns it;
 * returns -EINVAL if @dev is not a VXLAN device and 0 on success.
 */
int vxlan_fdb_replay(const struct net_device *dev, __be32 vni,
		     struct notifier_block *nb,
		     struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan;
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	unsigned int h;
	int rc;

	if (!netif_is_vxlan(dev))
		return -EINVAL;
	vxlan = netdev_priv(dev);

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		spin_lock_bh(&vxlan->hash_lock[h]);
		hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) {
			if (f->vni != vni)
				continue;

			list_for_each_entry(rdst, &f->remotes, list) {
				rc = vxlan_fdb_notify_one(nb, vxlan, f, rdst,
							  extack);
				if (rc)
					goto unlock;
			}
		}
		spin_unlock_bh(&vxlan->hash_lock[h]);
	}

	return 0;

unlock:
	/* h still indexes the bucket whose lock is held */
	spin_unlock_bh(&vxlan->hash_lock[h]);
	return rc;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_replay);

651 652 653 654 655 656 657 658 659 660 661 662
/* Clear the 'offloaded' mark on every remote of every FDB entry with VNI
 * @vni on @dev.  Does nothing if @dev is not a VXLAN device.
 */
void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni)
{
	struct vxlan_dev *vxlan;
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	unsigned int h;

	if (!netif_is_vxlan(dev))
		return;
	vxlan = netdev_priv(dev);

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		spin_lock_bh(&vxlan->hash_lock[h]);
		hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) {
			if (f->vni != vni)
				continue;

			list_for_each_entry(rdst, &f->remotes, list)
				rdst->offloaded = false;
		}
		spin_unlock_bh(&vxlan->hash_lock[h]);
	}
}
EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload);

674 675
/* Replace destination of unicast mac */
static int vxlan_fdb_replace(struct vxlan_fdb *f,
676
			     union vxlan_addr *ip, __be16 port, __be32 vni,
677
			     __u32 ifindex, struct vxlan_rdst *oldrd)
678 679 680 681 682 683 684 685 686 687
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

	rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
	if (!rd)
		return 0;
688

689
	*oldrd = *rd;
690
	dst_cache_reset(&rd->dst_cache);
C
Cong Wang 已提交
691
	rd->remote_ip = *ip;
692 693 694
	rd->remote_port = port;
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;
695
	rd->offloaded = false;
696 697 698
	return 1;
}

699 700
/* Add/update destinations for multicast */
static int vxlan_fdb_append(struct vxlan_fdb *f,
701
			    union vxlan_addr *ip, __be16 port, __be32 vni,
702
			    __u32 ifindex, struct vxlan_rdst **rdp)
703 704 705 706 707 708 709
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

710 711 712
	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
	if (rd == NULL)
		return -ENOBUFS;
713 714 715 716 717 718

	if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) {
		kfree(rd);
		return -ENOBUFS;
	}

C
Cong Wang 已提交
719
	rd->remote_ip = *ip;
720
	rd->remote_port = port;
721
	rd->offloaded = false;
722 723
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;
724 725 726

	list_add_tail_rcu(&rd->list, &f->remotes);

727
	*rdp = rd;
728 729 730
	return 1;
}

T
Tom Herbert 已提交
731 732 733
static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
					  unsigned int off,
					  struct vxlanhdr *vh, size_t hdrlen,
734 735
					  __be32 vni_field,
					  struct gro_remcsum *grc,
736
					  bool nopartial)
T
Tom Herbert 已提交
737
{
738
	size_t start, offset;
T
Tom Herbert 已提交
739 740

	if (skb->remcsum_offload)
741
		return vh;
T
Tom Herbert 已提交
742 743 744 745

	if (!NAPI_GRO_CB(skb)->csum_valid)
		return NULL;

746 747
	start = vxlan_rco_start(vni_field);
	offset = start + vxlan_rco_offset(vni_field);
T
Tom Herbert 已提交
748

749 750
	vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen,
				     start, offset, grc, nopartial);
T
Tom Herbert 已提交
751 752 753 754 755 756

	skb->remcsum_offload = 1;

	return vh;
}

757 758 759
static struct sk_buff *vxlan_gro_receive(struct sock *sk,
					 struct list_head *head,
					 struct sk_buff *skb)
760
{
761 762
	struct sk_buff *pp = NULL;
	struct sk_buff *p;
763
	struct vxlanhdr *vh, *vh2;
764
	unsigned int hlen, off_vx;
765
	int flush = 1;
766
	struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk);
767
	__be32 flags;
768 769 770
	struct gro_remcsum grc;

	skb_gro_remcsum_init(&grc);
771 772 773 774 775 776 777 778 779 780

	off_vx = skb_gro_offset(skb);
	hlen = off_vx + sizeof(*vh);
	vh   = skb_gro_header_fast(skb, off_vx);
	if (skb_gro_header_hard(skb, hlen)) {
		vh = skb_gro_header_slow(skb, hlen, off_vx);
		if (unlikely(!vh))
			goto out;
	}

T
Tom Herbert 已提交
781 782
	skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));

783
	flags = vh->vx_flags;
T
Tom Herbert 已提交
784 785 786

	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
		vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr),
787
				       vh->vx_vni, &grc,
788 789
				       !!(vs->flags &
					  VXLAN_F_REMCSUM_NOPARTIAL));
T
Tom Herbert 已提交
790 791 792 793 794

		if (!vh)
			goto out;
	}

795 796
	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */

797
	list_for_each_entry(p, head, list) {
798 799 800 801
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		vh2 = (struct vxlanhdr *)(p->data + off_vx);
T
Thomas Graf 已提交
802 803
		if (vh->vx_flags != vh2->vx_flags ||
		    vh->vx_vni != vh2->vx_vni) {
804 805 806 807 808
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}
	}

S
Sabrina Dubroca 已提交
809
	pp = call_gro_receive(eth_gro_receive, head, skb);
810
	flush = 0;
811 812

out:
813
	skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
814 815 816 817

	return pp;
}

818
static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
819
{
820 821 822
	/* Sets 'skb->inner_mac_header' since we are always called with
	 * 'skb->encapsulation' set.
	 */
823
	return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
824 825
}

826 827 828
/* Allocate (GFP_ATOMIC) and initialise an FDB entry for @mac.  The entry
 * starts with empty remote and nexthop lists and no nexthop, and is not yet
 * linked into any hash chain.  Returns NULL on allocation failure.
 */
static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac,
					 __u16 state, __be32 src_vni,
					 __u16 ndm_flags)
{
	struct vxlan_fdb *entry = kmalloc(sizeof(*entry), GFP_ATOMIC);

	if (!entry)
		return NULL;

	memcpy(entry->eth_addr, mac, ETH_ALEN);
	entry->vni = src_vni;
	entry->state = state;
	entry->flags = ndm_flags;
	entry->updated = entry->used = jiffies;

	entry->nh = NULL;
	RCU_INIT_POINTER(entry->vdev, vxlan);
	INIT_LIST_HEAD(&entry->nh_list);
	INIT_LIST_HEAD(&entry->remotes);

	return entry;
}

848 849 850 851 852 853 854 855
/* Link @f into the hash chain selected by (@mac, @src_vni) and account for
 * it in vxlan->addrcnt.
 */
static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac,
			     __be32 src_vni, struct vxlan_fdb *f)
{
	struct hlist_head *chain;

	++vxlan->addrcnt;
	chain = vxlan_fdb_head(vxlan, mac, src_vni);
	hlist_add_head_rcu(&f->hlist, chain);
}

856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877
/* Point @fdb at the nexthop object with id @nhid.
 *
 * The nexthop must exist, be an fdb nexthop, be a multipath group, and match
 * the address family of the device's default destination.  Returns 0 when
 * @fdb already uses @nhid, 1 when the nexthop was installed (dropping the
 * reference on any previous one), or a negative errno with an extack
 * message.
 */
static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			       u32 nhid, struct netlink_ext_ack *extack)
{
	struct nexthop *old_nh = rtnl_dereference(fdb->nh);
	struct nexthop *nh;
	int err = -EINVAL;

	if (old_nh && old_nh->id == nhid)
		return 0;

	nh = nexthop_find_by_id(vxlan->net, nhid);
	if (!nh) {
		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
		return err;
	}

	/* No reference was obtained, so there is nothing to put on failure. */
	if (!nexthop_get(nh)) {
		NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
		return err;
	}

	if (!nexthop_is_fdb(nh)) {
		NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop");
		goto err_put;
	}

	if (!nexthop_is_multipath(nh)) {
		NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group");
		goto err_put;
	}

	/* check nexthop group family */
	switch (vxlan->default_dst.remote_ip.sa.sa_family) {
	case AF_INET:
		if (!nexthop_has_v4(nh))
			goto err_family;
		break;
	case AF_INET6:
		if (nexthop_has_v4(nh))
			goto err_family;
	}

	if (old_nh) {
		list_del_rcu(&fdb->nh_list);
		nexthop_put(old_nh);
	}
	rcu_assign_pointer(fdb->nh, nh);
	list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list);
	return 1;

err_family:
	err = -EAFNOSUPPORT;
	NL_SET_ERR_MSG(extack, "Nexthop group family not supported");
err_put:
	nexthop_put(nh);
	return err;
}

S
stephen hemminger 已提交
920
static int vxlan_fdb_create(struct vxlan_dev *vxlan,
921 922
			    const u8 *mac, union vxlan_addr *ip,
			    __u16 state, __be16 port, __be32 src_vni,
P
Petr Machata 已提交
923
			    __be32 vni, __u32 ifindex, __u16 ndm_flags,
924 925
			    u32 nhid, struct vxlan_fdb **fdb,
			    struct netlink_ext_ack *extack)
926 927 928 929 930 931 932 933 934 935
{
	struct vxlan_rdst *rd = NULL;
	struct vxlan_fdb *f;
	int rc;

	if (vxlan->cfg.addrmax &&
	    vxlan->addrcnt >= vxlan->cfg.addrmax)
		return -ENOSPC;

	netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
936
	f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags);
937 938 939
	if (!f)
		return -ENOMEM;

940 941 942 943 944 945
	if (nhid)
		rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack);
	else
		rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
	if (rc < 0)
		goto errout;
946 947 948 949

	*fdb = f;

	return 0;
950 951 952 953

errout:
	kfree(f);
	return rc;
954 955
}

956
static void __vxlan_fdb_free(struct vxlan_fdb *f)
957 958
{
	struct vxlan_rdst *rd, *nd;
959 960 961 962 963
	struct nexthop *nh;

	nh = rcu_dereference_raw(f->nh);
	if (nh) {
		rcu_assign_pointer(f->nh, NULL);
964
		rcu_assign_pointer(f->vdev, NULL);
965 966
		nexthop_put(nh);
	}
967 968 969 970 971 972 973 974

	list_for_each_entry_safe(rd, nd, &f->remotes, list) {
		dst_cache_destroy(&rd->dst_cache);
		kfree(rd);
	}
	kfree(f);
}

975 976 977 978 979 980 981
/* RCU callback: free the FDB entry that embeds @head. */
static void vxlan_fdb_free(struct rcu_head *head)
{
	__vxlan_fdb_free(container_of(head, struct vxlan_fdb, rcu));
}

982 983 984 985 986 987 988 989
/* Remove @f from the forwarding table and free it after an RCU grace
 * period.  With @do_notify set, RTM_DELNEIGH is sent first: once for a
 * nexthop-backed entry, otherwise once per remote.
 */
static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
			      bool do_notify, bool swdev_notify)
{
	struct vxlan_rdst *rd;

	netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr);

	--vxlan->addrcnt;

	if (do_notify) {
		if (rcu_access_pointer(f->nh)) {
			vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH,
					 swdev_notify, NULL);
		} else {
			list_for_each_entry(rd, &f->remotes, list)
				vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH,
						 swdev_notify, NULL);
		}
	}

	hlist_del_rcu(&f->hlist);
	list_del_rcu(&f->nh_list);
	call_rcu(&f->rcu, vxlan_fdb_free);
}

1005 1006 1007 1008 1009 1010 1011 1012
/* RCU callback: destroy the dst cache of the remote that embeds @head and
 * free the remote.
 */
static void vxlan_dst_free(struct rcu_head *head)
{
	struct vxlan_rdst *remote = container_of(head, struct vxlan_rdst, rcu);

	dst_cache_destroy(&remote->dst_cache);
	kfree(remote);
}

1013 1014 1015 1016 1017
static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan,
				     union vxlan_addr *ip,
				     __u16 state, __u16 flags,
				     __be16 port, __be32 vni,
				     __u32 ifindex, __u16 ndm_flags,
1018
				     struct vxlan_fdb *f, u32 nhid,
1019 1020
				     bool swdev_notify,
				     struct netlink_ext_ack *extack)
S
stephen hemminger 已提交
1021
{
P
Petr Machata 已提交
1022
	__u16 fdb_flags = (ndm_flags & ~NTF_USE);
1023
	struct vxlan_rdst *rd = NULL;
1024
	struct vxlan_rdst oldrd;
S
stephen hemminger 已提交
1025
	int notify = 0;
1026 1027
	int rc = 0;
	int err;
S
stephen hemminger 已提交
1028

1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040
	if (nhid && !rcu_access_pointer(f->nh)) {
		NL_SET_ERR_MSG(extack,
			       "Cannot replace an existing non nexthop fdb with a nexthop");
		return -EOPNOTSUPP;
	}

	if (nhid && (flags & NLM_F_APPEND)) {
		NL_SET_ERR_MSG(extack,
			       "Cannot append to a nexthop fdb");
		return -EOPNOTSUPP;
	}

1041 1042 1043 1044 1045 1046 1047 1048 1049
	/* Do not allow an externally learned entry to take over an entry added
	 * by the user.
	 */
	if (!(fdb_flags & NTF_EXT_LEARNED) ||
	    !(f->flags & NTF_VXLAN_ADDED_BY_USER)) {
		if (f->state != state) {
			f->state = state;
			f->updated = jiffies;
			notify = 1;
1050
		}
1051 1052 1053 1054
		if (f->flags != fdb_flags) {
			f->flags = fdb_flags;
			f->updated = jiffies;
			notify = 1;
1055
		}
1056
	}
1057

1058 1059 1060 1061
	if ((flags & NLM_F_REPLACE)) {
		/* Only change unicasts */
		if (!(is_multicast_ether_addr(f->eth_addr) ||
		      is_zero_ether_addr(f->eth_addr))) {
1062 1063 1064 1065 1066 1067 1068 1069
			if (nhid) {
				rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack);
				if (rc < 0)
					return rc;
			} else {
				rc = vxlan_fdb_replace(f, ip, port, vni,
						       ifindex, &oldrd);
			}
1070
			notify |= rc;
1071
		} else {
1072
			NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries");
1073
			return -EOPNOTSUPP;
1074 1075 1076 1077 1078 1079
		}
	}
	if ((flags & NLM_F_APPEND) &&
	    (is_multicast_ether_addr(f->eth_addr) ||
	     is_zero_ether_addr(f->eth_addr))) {
		rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
1080

1081
		if (rc < 0)
1082
			return rc;
1083
		notify |= rc;
S
stephen hemminger 已提交
1084 1085
	}

1086 1087 1088
	if (ndm_flags & NTF_USE)
		f->used = jiffies;

1089 1090 1091
	if (notify) {
		if (rd == NULL)
			rd = first_remote_rtnl(f);
1092

1093
		err = vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH,
1094
				       swdev_notify, extack);
1095 1096
		if (err)
			goto err_notify;
1097
	}
S
stephen hemminger 已提交
1098 1099

	return 0;
1100 1101

err_notify:
1102 1103
	if (nhid)
		return err;
1104 1105
	if ((flags & NLM_F_REPLACE) && rc)
		*rd = oldrd;
1106
	else if ((flags & NLM_F_APPEND) && rc) {
1107
		list_del_rcu(&rd->list);
1108 1109
		call_rcu(&rd->rcu, vxlan_dst_free);
	}
1110
	return err;
S
stephen hemminger 已提交
1111 1112
}

1113 1114 1115 1116
static int vxlan_fdb_update_create(struct vxlan_dev *vxlan,
				   const u8 *mac, union vxlan_addr *ip,
				   __u16 state, __u16 flags,
				   __be16 port, __be32 src_vni, __be32 vni,
1117
				   __u32 ifindex, __u16 ndm_flags, u32 nhid,
1118 1119
				   bool swdev_notify,
				   struct netlink_ext_ack *extack)
1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131
{
	__u16 fdb_flags = (ndm_flags & ~NTF_USE);
	struct vxlan_fdb *f;
	int rc;

	/* Disallow replace to add a multicast entry */
	if ((flags & NLM_F_REPLACE) &&
	    (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
		return -EOPNOTSUPP;

	netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
	rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni,
1132
			      vni, ifindex, fdb_flags, nhid, &f, extack);
1133 1134 1135
	if (rc < 0)
		return rc;

1136
	vxlan_fdb_insert(vxlan, mac, src_vni, f);
1137
	rc = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH,
1138
			      swdev_notify, extack);
1139 1140 1141
	if (rc)
		goto err_notify;

1142
	return 0;
1143 1144 1145 1146

err_notify:
	vxlan_fdb_destroy(vxlan, f, false, false);
	return rc;
1147 1148 1149 1150 1151 1152 1153
}

/* Add new entry to forwarding table -- assumes lock held */
static int vxlan_fdb_update(struct vxlan_dev *vxlan,
			    const u8 *mac, union vxlan_addr *ip,
			    __u16 state, __u16 flags,
			    __be16 port, __be32 src_vni, __be32 vni,
1154
			    __u32 ifindex, __u16 ndm_flags, u32 nhid,
1155 1156
			    bool swdev_notify,
			    struct netlink_ext_ack *extack)
1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169
{
	struct vxlan_fdb *f;

	f = __vxlan_find_mac(vxlan, mac, src_vni);
	if (f) {
		if (flags & NLM_F_EXCL) {
			netdev_dbg(vxlan->dev,
				   "lost race to create %pM\n", mac);
			return -EEXIST;
		}

		return vxlan_fdb_update_existing(vxlan, ip, state, flags, port,
						 vni, ifindex, ndm_flags, f,
1170
						 nhid, swdev_notify, extack);
1171 1172 1173 1174 1175 1176
	} else {
		if (!(flags & NLM_F_CREATE))
			return -ENOENT;

		return vxlan_fdb_update_create(vxlan, mac, ip, state, flags,
					       port, src_vni, vni, ifindex,
1177 1178
					       ndm_flags, nhid, swdev_notify,
					       extack);
1179 1180 1181
	}
}

1182
/* Remove a single remote @rd from fdb entry @f: unlink it, send a
 * RTM_DELNEIGH notification for it, and free it through call_rcu().
 */
static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan,
				  struct vxlan_fdb *f,
				  struct vxlan_rdst *rd,
				  bool swdev_notify)
{
	list_del_rcu(&rd->list);

	vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify, NULL);

	call_rcu(&rd->rcu, vxlan_dst_free);
}

M
Mike Rapoport 已提交
1190
/* Decode the netlink attributes of an fdb request into its components.
 *
 * Every output falls back to a default when its attribute is absent:
 *   @ip       NDA_DST,     else the "any" address of the device's
 *                          default remote family
 *   @port     NDA_PORT,    else vxlan->cfg.dst_port
 *   @vni      NDA_VNI,     else the default remote VNI
 *   @src_vni  NDA_SRC_VNI, else the default remote VNI
 *   @ifindex  NDA_IFINDEX (must name an existing device), else 0
 *   @nhid     NDA_NH_ID,   else 0
 *
 * NDA_NH_ID cannot be combined with NDA_DST, NDA_VNI, NDA_IFINDEX or
 * NDA_PORT.
 *
 * Returns 0 on success, -EINVAL for a bad attribute combination or
 * length, -EADDRNOTAVAIL for an unknown ifindex, or the error from
 * vxlan_nla_get_addr().
 */
static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
			   union vxlan_addr *ip, __be16 *port, __be32 *src_vni,
			   __be32 *vni, u32 *ifindex, u32 *nhid)
{
	int err;

	if (tb[NDA_NH_ID] &&
	    (tb[NDA_DST] || tb[NDA_VNI] || tb[NDA_IFINDEX] || tb[NDA_PORT]))
		return -EINVAL;

	if (!tb[NDA_DST]) {
		union vxlan_addr *remote = &vxlan->default_dst.remote_ip;

		if (remote->sa.sa_family == AF_INET) {
			ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
			ip->sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			ip->sin6.sin6_addr = in6addr_any;
			ip->sa.sa_family = AF_INET6;
#endif
		}
	} else {
		err = vxlan_nla_get_addr(ip, tb[NDA_DST]);
		if (err)
			return err;
	}

	if (!tb[NDA_PORT]) {
		*port = vxlan->cfg.dst_port;
	} else {
		if (nla_len(tb[NDA_PORT]) != sizeof(__be16))
			return -EINVAL;
		*port = nla_get_be16(tb[NDA_PORT]);
	}

	if (!tb[NDA_VNI]) {
		*vni = vxlan->default_dst.remote_vni;
	} else {
		if (nla_len(tb[NDA_VNI]) != sizeof(u32))
			return -EINVAL;
		*vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
	}

	if (!tb[NDA_SRC_VNI]) {
		*src_vni = vxlan->default_dst.remote_vni;
	} else {
		if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32))
			return -EINVAL;
		*src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI]));
	}

	if (!tb[NDA_IFINDEX]) {
		*ifindex = 0;
	} else {
		if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
			return -EINVAL;
		*ifindex = nla_get_u32(tb[NDA_IFINDEX]);

		/* The index must resolve to a device in the vxlan device's netns */
		if (!__dev_get_by_index(dev_net(vxlan->dev), *ifindex))
			return -EADDRNOTAVAIL;
	}

	*nhid = tb[NDA_NH_ID] ? nla_get_u32(tb[NDA_NH_ID]) : 0;

	return 0;
}

/* Add static entry (via netlink).
 *
 * Rejects requests whose neighbour state is neither NUD_PERMANENT nor
 * NUD_REACHABLE, that carry neither NDA_DST nor NDA_NH_ID, or whose
 * address family differs from the device's default remote. The update
 * itself runs under the hash bucket lock of @addr, and the entry is
 * marked NTF_VXLAN_ADDED_BY_USER.
 */
static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr, u16 vid, u16 flags,
			 struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	__be32 src_vni, vni;
	union vxlan_addr ip;
	u32 ifindex, nhid;
	u32 hash_index;
	__be16 port;
	int err;

	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
		pr_info("RTM_NEWNEIGH with invalid state %#x\n",
			ndm->ndm_state);
		return -EINVAL;
	}

	if (!tb)
		return -EINVAL;

	/* Either a destination address or a nexthop id is required */
	if (!tb[NDA_DST] && !tb[NDA_NH_ID])
		return -EINVAL;

	err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
			      &nhid);
	if (err)
		return err;

	if (ip.sa.sa_family != vxlan->default_dst.remote_ip.sa.sa_family)
		return -EAFNOSUPPORT;

	hash_index = fdb_head_index(vxlan, addr, src_vni);

	spin_lock_bh(&vxlan->hash_lock[hash_index]);
	err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags,
			       port, src_vni, vni, ifindex,
			       ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER,
			       nhid, true, extack);
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);

	return err;
}

1307 1308
static int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
			      const unsigned char *addr, union vxlan_addr ip,
1309
			      __be16 port, __be32 src_vni, __be32 vni,
1310
			      u32 ifindex, bool swdev_notify)
S
stephen hemminger 已提交
1311
{
1312
	struct vxlan_rdst *rd = NULL;
1313
	struct vxlan_fdb *f;
1314
	int err = -ENOENT;
1315

1316
	f = vxlan_find_mac(vxlan, addr, src_vni);
1317
	if (!f)
1318
		return err;
1319

C
Cong Wang 已提交
1320 1321
	if (!vxlan_addr_any(&ip)) {
		rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex);
1322 1323 1324 1325 1326 1327 1328 1329
		if (!rd)
			goto out;
	}

	/* remove a destination if it's not the only one on the list,
	 * otherwise destroy the fdb entry
	 */
	if (rd && !list_is_singular(&f->remotes)) {
1330
		vxlan_fdb_dst_destroy(vxlan, f, rd, swdev_notify);
1331
		goto out;
S
stephen hemminger 已提交
1332
	}
1333

1334
	vxlan_fdb_destroy(vxlan, f, true, swdev_notify);
1335 1336

out:
1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347
	return 0;
}

/* Delete entry (via netlink).
 *
 * Parses the request attributes and performs the delete under the hash
 * bucket lock of @addr. The parsed nexthop id is not used here.
 */
static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
			    struct net_device *dev,
			    const unsigned char *addr, u16 vid)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	u32 ifindex, nhid, hash_index;
	__be32 src_vni, vni;
	union vxlan_addr ip;
	__be16 port;
	int err;

	err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
			      &nhid);
	if (err)
		return err;

	hash_index = fdb_head_index(vxlan, addr, src_vni);

	spin_lock_bh(&vxlan->hash_lock[hash_index]);
	err = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex,
				 true);
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);

	return err;
}

/* Dump forwarding table */
static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
1369
			  struct net_device *dev,
1370
			  struct net_device *filter_dev, int *idx)
S
stephen hemminger 已提交
1371 1372 1373
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	unsigned int h;
1374
	int err = 0;
S
stephen hemminger 已提交
1375 1376 1377 1378

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct vxlan_fdb *f;

1379
		rcu_read_lock();
1380
		hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
1381 1382
			struct vxlan_rdst *rd;

1383
			if (rcu_access_pointer(f->nh)) {
1384 1385
				if (*idx < cb->args[2])
					goto skip_nh;
1386 1387 1388 1389 1390
				err = vxlan_fdb_info(skb, vxlan, f,
						     NETLINK_CB(cb->skb).portid,
						     cb->nlh->nlmsg_seq,
						     RTM_NEWNEIGH,
						     NLM_F_MULTI, NULL);
1391 1392
				if (err < 0) {
					rcu_read_unlock();
1393
					goto out;
1394
				}
1395 1396
skip_nh:
				*idx += 1;
1397 1398 1399
				continue;
			}

1400
			list_for_each_entry_rcu(rd, &f->remotes, list) {
1401
				if (*idx < cb->args[2])
1402 1403
					goto skip;

1404 1405 1406 1407 1408
				err = vxlan_fdb_info(skb, vxlan, f,
						     NETLINK_CB(cb->skb).portid,
						     cb->nlh->nlmsg_seq,
						     RTM_NEWNEIGH,
						     NLM_F_MULTI, rd);
1409 1410
				if (err < 0) {
					rcu_read_unlock();
1411
					goto out;
1412
				}
1413
skip:
1414
				*idx += 1;
1415
			}
S
stephen hemminger 已提交
1416
		}
1417
		rcu_read_unlock();
S
stephen hemminger 已提交
1418
	}
1419
out:
1420
	return err;
S
stephen hemminger 已提交
1421 1422
}

R
Roopa Prabhu 已提交
1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455
/* Look up a single fdb entry by MAC and VNI and fill @skb with its
 * RTM_NEWNEIGH description. The VNI comes from NDA_VNI when present,
 * otherwise the device's default remote VNI is used.
 *
 * Returns the result of vxlan_fdb_info(), or -ENOENT (with an extack
 * message) when no entry matches.
 */
static int vxlan_fdb_get(struct sk_buff *skb,
			 struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr,
			 u16 vid, u32 portid, u32 seq,
			 struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
	__be32 vni;
	int err;

	vni = tb[NDA_VNI] ? cpu_to_be32(nla_get_u32(tb[NDA_VNI])) :
			    vxlan->default_dst.remote_vni;

	rcu_read_lock();

	f = __vxlan_find_mac(vxlan, addr, vni);
	if (f) {
		err = vxlan_fdb_info(skb, vxlan, f, portid, seq,
				     RTM_NEWNEIGH, 0, first_remote_rcu(f));
	} else {
		NL_SET_ERR_MSG(extack, "Fdb entry not found");
		err = -ENOENT;
	}

	rcu_read_unlock();
	return err;
}

S
stephen hemminger 已提交
1456 1457
/* Watch incoming packets to learn mapping between Ethernet address
 * and Tunnel endpoint.
1458
 * Return true if packet is bogus and should be dropped.
S
stephen hemminger 已提交
1459
 */
1460
static bool vxlan_snoop(struct net_device *dev,
1461
			union vxlan_addr *src_ip, const u8 *src_mac,
1462
			u32 src_ifindex, __be32 vni)
S
stephen hemminger 已提交
1463 1464 1465
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
1466 1467 1468 1469 1470 1471 1472
	u32 ifindex = 0;

#if IS_ENABLED(CONFIG_IPV6)
	if (src_ip->sa.sa_family == AF_INET6 &&
	    (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL))
		ifindex = src_ifindex;
#endif
S
stephen hemminger 已提交
1473

1474
	f = vxlan_find_mac(vxlan, src_mac, vni);
S
stephen hemminger 已提交
1475
	if (likely(f)) {
1476
		struct vxlan_rdst *rdst = first_remote_rcu(f);
1477

1478 1479
		if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) &&
			   rdst->remote_ifindex == ifindex))
1480 1481 1482
			return false;

		/* Don't migrate static entries, drop packets */
1483
		if (f->state & (NUD_PERMANENT | NUD_NOARP))
1484
			return true;
S
stephen hemminger 已提交
1485

1486 1487 1488 1489
		/* Don't override an fdb with nexthop with a learnt entry */
		if (rcu_access_pointer(f->nh))
			return true;

S
stephen hemminger 已提交
1490 1491
		if (net_ratelimit())
			netdev_info(dev,
C
Cong Wang 已提交
1492
				    "%pM migrated from %pIS to %pIS\n",
1493
				    src_mac, &rdst->remote_ip.sa, &src_ip->sa);
S
stephen hemminger 已提交
1494

C
Cong Wang 已提交
1495
		rdst->remote_ip = *src_ip;
S
stephen hemminger 已提交
1496
		f->updated = jiffies;
1497
		vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL);
S
stephen hemminger 已提交
1498
	} else {
1499 1500
		u32 hash_index = fdb_head_index(vxlan, src_mac, vni);

S
stephen hemminger 已提交
1501
		/* learned new entry */
1502
		spin_lock(&vxlan->hash_lock[hash_index]);
1503 1504 1505

		/* close off race between vxlan_flush and incoming packets */
		if (netif_running(dev))
1506
			vxlan_fdb_update(vxlan, src_mac, src_ip,
1507 1508
					 NUD_REACHABLE,
					 NLM_F_EXCL|NLM_F_CREATE,
1509
					 vxlan->cfg.dst_port,
1510
					 vni,
1511
					 vxlan->default_dst.remote_vni,
1512
					 ifindex, NTF_SELF, 0, true, NULL);
1513
		spin_unlock(&vxlan->hash_lock[hash_index]);
S
stephen hemminger 已提交
1514
	}
1515 1516

	return false;
S
stephen hemminger 已提交
1517 1518 1519
}

/* See if multicast group is already in use by other ID */
1520
static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
S
stephen hemminger 已提交
1521
{
1522
	struct vxlan_dev *vxlan;
1523
	struct vxlan_sock *sock4;
A
Arnd Bergmann 已提交
1524 1525 1526
#if IS_ENABLED(CONFIG_IPV6)
	struct vxlan_sock *sock6;
#endif
1527
	unsigned short family = dev->default_dst.remote_ip.sa.sa_family;
S
stephen hemminger 已提交
1528

1529 1530
	sock4 = rtnl_dereference(dev->vn4_sock);

1531 1532 1533
	/* The vxlan_sock is only used by dev, leaving group has
	 * no effect on other vxlan devices.
	 */
1534
	if (family == AF_INET && sock4 && refcount_read(&sock4->refcnt) == 1)
1535
		return false;
1536
#if IS_ENABLED(CONFIG_IPV6)
1537
	sock6 = rtnl_dereference(dev->vn6_sock);
1538
	if (family == AF_INET6 && sock6 && refcount_read(&sock6->refcnt) == 1)
1539 1540
		return false;
#endif
1541

1542
	list_for_each_entry(vxlan, &vn->vxlan_list, next) {
1543
		if (!netif_running(vxlan->dev) || vxlan == dev)
1544
			continue;
S
stephen hemminger 已提交
1545

1546 1547
		if (family == AF_INET &&
		    rtnl_dereference(vxlan->vn4_sock) != sock4)
1548
			continue;
1549
#if IS_ENABLED(CONFIG_IPV6)
1550 1551
		if (family == AF_INET6 &&
		    rtnl_dereference(vxlan->vn6_sock) != sock6)
1552 1553
			continue;
#endif
1554 1555 1556 1557 1558 1559 1560 1561 1562 1563

		if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip,
				      &dev->default_dst.remote_ip))
			continue;

		if (vxlan->default_dst.remote_ifindex !=
		    dev->default_dst.remote_ifindex)
			continue;

		return true;
1564
	}
S
stephen hemminger 已提交
1565 1566 1567 1568

	return false;
}

1569
static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
1570
{
1571
	struct vxlan_net *vn;
1572

1573
	if (!vs)
1574
		return false;
1575
	if (!refcount_dec_and_test(&vs->refcnt))
1576
		return false;
S
stephen hemminger 已提交
1577

1578
	vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id);
1579
	spin_lock(&vn->sock_lock);
1580
	hlist_del_rcu(&vs->hlist);
1581
	udp_tunnel_notify_del_rx_port(vs->sock,
1582 1583
				      (vs->flags & VXLAN_F_GPE) ?
				      UDP_TUNNEL_TYPE_VXLAN_GPE :
1584
				      UDP_TUNNEL_TYPE_VXLAN);
1585 1586
	spin_unlock(&vn->sock_lock);

1587
	return true;
S
stephen hemminger 已提交
1588 1589
}

1590 1591
static void vxlan_sock_release(struct vxlan_dev *vxlan)
{
1592
	struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
1593
#if IS_ENABLED(CONFIG_IPV6)
1594 1595
	struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

1596
	RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
1597 1598
#endif

1599
	RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
1600 1601
	synchronize_net();

1602 1603
	vxlan_vs_del_dev(vxlan);

1604 1605 1606
	if (__vxlan_sock_release_prep(sock4)) {
		udp_tunnel_sock_release(sock4->sock);
		kfree(sock4);
1607 1608 1609
	}

#if IS_ENABLED(CONFIG_IPV6)
1610 1611 1612
	if (__vxlan_sock_release_prep(sock6)) {
		udp_tunnel_sock_release(sock6->sock);
		kfree(sock6);
1613
	}
1614 1615 1616
#endif
}

1617
/* Update multicast group membership when first VNI on
1618
 * multicast address is brought up
1619
 */
1620
static int vxlan_igmp_join(struct vxlan_dev *vxlan)
S
stephen hemminger 已提交
1621
{
1622
	struct sock *sk;
C
Cong Wang 已提交
1623 1624
	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
	int ifindex = vxlan->default_dst.remote_ifindex;
1625
	int ret = -EINVAL;
S
stephen hemminger 已提交
1626

C
Cong Wang 已提交
1627
	if (ip->sa.sa_family == AF_INET) {
1628
		struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
C
Cong Wang 已提交
1629 1630 1631 1632 1633
		struct ip_mreqn mreq = {
			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
			.imr_ifindex		= ifindex,
		};

1634
		sk = sock4->sock->sk;
1635
		lock_sock(sk);
1636
		ret = ip_mc_join_group(sk, &mreq);
1637
		release_sock(sk);
C
Cong Wang 已提交
1638 1639
#if IS_ENABLED(CONFIG_IPV6)
	} else {
1640 1641 1642
		struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

		sk = sock6->sock->sk;
1643
		lock_sock(sk);
1644 1645
		ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
						   &ip->sin6.sin6_addr);
1646
		release_sock(sk);
C
Cong Wang 已提交
1647 1648
#endif
	}
S
stephen hemminger 已提交
1649

1650
	return ret;
S
stephen hemminger 已提交
1651 1652 1653
}

/* Inverse of vxlan_igmp_join when last VNI is brought down */
1654
static int vxlan_igmp_leave(struct vxlan_dev *vxlan)
S
stephen hemminger 已提交
1655
{
1656
	struct sock *sk;
C
Cong Wang 已提交
1657 1658
	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
	int ifindex = vxlan->default_dst.remote_ifindex;
1659
	int ret = -EINVAL;
S
stephen hemminger 已提交
1660

C
Cong Wang 已提交
1661
	if (ip->sa.sa_family == AF_INET) {
1662
		struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
C
Cong Wang 已提交
1663 1664 1665 1666 1667
		struct ip_mreqn mreq = {
			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
			.imr_ifindex		= ifindex,
		};

1668
		sk = sock4->sock->sk;
1669
		lock_sock(sk);
1670
		ret = ip_mc_leave_group(sk, &mreq);
1671
		release_sock(sk);
C
Cong Wang 已提交
1672 1673
#if IS_ENABLED(CONFIG_IPV6)
	} else {
1674 1675 1676
		struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

		sk = sock6->sock->sk;
1677
		lock_sock(sk);
1678 1679
		ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
						   &ip->sin6.sin6_addr);
1680
		release_sock(sk);
C
Cong Wang 已提交
1681 1682
#endif
	}
S
stephen hemminger 已提交
1683

1684
	return ret;
S
stephen hemminger 已提交
1685 1686
}

1687 1688
/* Handle the remote checksum offload (RCO) bits of a received header.
 *
 * When the RCO flag is set and the skb is not already marked
 * remcsum_offload, the checksum start/offset are decoded from the VNI
 * field and the remote checksum is processed. In all successful cases
 * the RCO flag and the non-VNI bits of vx_vni are cleared in @unparsed.
 *
 * Returns false if the packet is too short for the checksum field,
 * true otherwise.
 */
static bool vxlan_remcsum(struct vxlanhdr *unparsed,
			  struct sk_buff *skb, u32 vxflags)
{
	if ((unparsed->vx_flags & VXLAN_HF_RCO) && !skb->remcsum_offload) {
		size_t start = vxlan_rco_start(unparsed->vx_vni);
		size_t offset = start + vxlan_rco_offset(unparsed->vx_vni);

		/* The 16-bit checksum field at @offset must be pullable */
		if (!pskb_may_pull(skb, offset + sizeof(u16)))
			return false;

		skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1), start,
				    offset,
				    !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL));
	}

	unparsed->vx_flags &= ~VXLAN_HF_RCO;
	unparsed->vx_vni &= VXLAN_VNI_MASK;
	return true;
}

1709
/* Parse the Group Based Policy (GBP) extension of a received header.
 *
 * When the GBP flag is set, the policy id and the dont_learn /
 * policy_applied bits are folded into md->gbp, the tunnel metadata dst
 * (if the skb has one) is marked as carrying VXLAN options, and - unless
 * the socket collects metadata - the value is also stored in skb->mark.
 * The GBP bits are always cleared from @unparsed afterwards.
 */
static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
				struct sk_buff *skb, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed;

	if (unparsed->vx_flags & VXLAN_HF_GBP) {
		struct metadata_dst *tun_dst;

		md->gbp = ntohs(gbp->policy_id);

		tun_dst = (struct metadata_dst *)skb_dst(skb);
		if (tun_dst) {
			tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
			tun_dst->u.tun_info.options_len = sizeof(*md);
		}

		if (gbp->dont_learn)
			md->gbp |= VXLAN_GBP_DONT_LEARN;

		if (gbp->policy_applied)
			md->gbp |= VXLAN_GBP_POLICY_APPLIED;

		/* In flow-based mode, GBP is carried in dst_metadata */
		if (!(vxflags & VXLAN_F_COLLECT_METADATA))
			skb->mark = md->gbp;
	}

	unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS;
}

J
Jiri Benc 已提交
1739
static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed,
J
Jiri Benc 已提交
1740
				__be16 *protocol,
J
Jiri Benc 已提交
1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759
				struct sk_buff *skb, u32 vxflags)
{
	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed;

	/* Need to have Next Protocol set for interfaces in GPE mode. */
	if (!gpe->np_applied)
		return false;
	/* "The initial version is 0. If a receiver does not support the
	 * version indicated it MUST drop the packet.
	 */
	if (gpe->version != 0)
		return false;
	/* "When the O bit is set to 1, the packet is an OAM packet and OAM
	 * processing MUST occur." However, we don't implement OAM
	 * processing, thus drop the packet.
	 */
	if (gpe->oam_flag)
		return false;

1760 1761
	*protocol = tun_p_to_eth_p(gpe->next_protocol);
	if (!*protocol)
J
Jiri Benc 已提交
1762 1763 1764 1765 1766 1767
		return false;

	unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS;
	return true;
}

1768 1769
/* Set up the inner Ethernet header of a decapsulated frame and, when
 * learning is enabled, feed its source MAC / outer source address to
 * vxlan_snoop().
 *
 * Returns false if the frame must be dropped: either its source MAC is
 * the vxlan device's own address, or vxlan_snoop() flagged it as bogus.
 */
static bool vxlan_set_mac(struct vxlan_dev *vxlan,
			  struct vxlan_sock *vs,
			  struct sk_buff *skb, __be32 vni)
{
	/* Sampled before eth_type_trans() is called with vxlan->dev;
	 * NOTE(review): presumably that call retargets skb->dev, which is
	 * why the receiving ifindex is captured up front - confirm.
	 */
	u32 ifindex = skb->dev->ifindex;
	union vxlan_addr saddr;

	skb_reset_mac_header(skb);
	skb->protocol = eth_type_trans(skb, vxlan->dev);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	/* Ignore packet loops (and multicast echo) */
	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
		return false;

	/* Get address from the outer IP header */
	if (vxlan_get_sk_family(vs) == AF_INET) {
		saddr.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
		saddr.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		saddr.sin6.sin6_addr = ipv6_hdr(skb)->saddr;
		saddr.sa.sa_family = AF_INET6;
#endif
	}

	if (!(vxlan->cfg.flags & VXLAN_F_LEARN))
		return true;

	return !vxlan_snoop(skb->dev, &saddr, eth_hdr(skb)->h_source,
			    ifindex, vni);
}

1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824
/* Run ECN decapsulation for the outer header @oiph, picking the IPv4 or
 * IPv6 variant from the socket family. A non-zero result is logged
 * (rate limited) when log_ecn_error is set.
 *
 * Returns true when the decapsulation result is <= 1, i.e. the packet
 * may be accepted; false means it has to be dropped.
 */
static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
				  struct sk_buff *skb)
{
	bool is_v4 = vxlan_get_sk_family(vs) == AF_INET;
	int err = 0;

	if (is_v4)
		err = IP_ECN_decapsulate(oiph, skb);
#if IS_ENABLED(CONFIG_IPV6)
	else
		err = IP6_ECN_decapsulate(oiph, skb);
#endif

	if (unlikely(err) && log_ecn_error) {
		if (is_v4)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					     &((struct iphdr *)oiph)->saddr,
					     ((struct iphdr *)oiph)->tos);
		else
			net_info_ratelimited("non-ECT from %pI6\n",
					     &((struct ipv6hdr *)oiph)->saddr);
	}

	return err <= 1;
}

S
stephen hemminger 已提交
1825
/* Callback from net/ipv4/udp.c to receive packets */
1826
static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
S
stephen hemminger 已提交
1827
{
1828
	struct pcpu_sw_netstats *stats;
1829
	struct vxlan_dev *vxlan;
P
Pravin B Shelar 已提交
1830
	struct vxlan_sock *vs;
1831
	struct vxlanhdr unparsed;
T
Thomas Graf 已提交
1832 1833
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
J
Jiri Benc 已提交
1834
	__be16 protocol = htons(ETH_P_TEB);
J
Jiri Benc 已提交
1835
	bool raw_proto = false;
1836
	void *oiph;
1837
	__be32 vni = 0;
S
stephen hemminger 已提交
1838

J
Jiri Benc 已提交
1839
	/* Need UDP and VXLAN header to be present */
1840
	if (!pskb_may_pull(skb, VXLAN_HLEN))
1841
		goto drop;
S
stephen hemminger 已提交
1842

1843
	unparsed = *vxlan_hdr(skb);
J
Jiri Benc 已提交
1844 1845 1846 1847 1848 1849
	/* VNI flag always required to be set */
	if (!(unparsed.vx_flags & VXLAN_HF_VNI)) {
		netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
			   ntohl(vxlan_hdr(skb)->vx_flags),
			   ntohl(vxlan_hdr(skb)->vx_vni));
		/* Return non vxlan pkt */
1850
		goto drop;
S
stephen hemminger 已提交
1851
	}
J
Jiri Benc 已提交
1852 1853
	unparsed.vx_flags &= ~VXLAN_HF_VNI;
	unparsed.vx_vni &= ~VXLAN_VNI_MASK;
S
stephen hemminger 已提交
1854

1855
	vs = rcu_dereference_sk_user_data(sk);
P
Pravin B Shelar 已提交
1856
	if (!vs)
S
stephen hemminger 已提交
1857 1858
		goto drop;

1859 1860
	vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);

1861
	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
1862 1863 1864
	if (!vxlan)
		goto drop;

J
Jiri Benc 已提交
1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875
	/* For backwards compatibility, only allow reserved fields to be
	 * used by VXLAN extensions if explicitly requested.
	 */
	if (vs->flags & VXLAN_F_GPE) {
		if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
			goto drop;
		raw_proto = true;
	}

	if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto,
				   !net_eq(vxlan->net, dev_net(vxlan->dev))))
1876
		goto drop;
1877

T
Thomas Graf 已提交
1878
	if (vxlan_collect_metadata(vs)) {
1879
		struct metadata_dst *tun_dst;
J
Jiri Benc 已提交
1880

1881
		tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY,
1882
					 key32_to_tunnel_id(vni), sizeof(*md));
1883

T
Thomas Graf 已提交
1884 1885 1886
		if (!tun_dst)
			goto drop;

1887
		md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
1888 1889

		skb_dst_set(skb, (struct dst_entry *)tun_dst);
T
Thomas Graf 已提交
1890 1891 1892 1893
	} else {
		memset(md, 0, sizeof(*md));
	}

1894 1895 1896 1897
	if (vs->flags & VXLAN_F_REMCSUM_RX)
		if (!vxlan_remcsum(&unparsed, skb, vs->flags))
			goto drop;
	if (vs->flags & VXLAN_F_GBP)
1898
		vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md);
J
Jiri Benc 已提交
1899 1900 1901
	/* Note that GBP and GPE can never be active together. This is
	 * ensured in vxlan_dev_configure.
	 */
T
Thomas Graf 已提交
1902

1903
	if (unparsed.vx_flags || unparsed.vx_vni) {
1904 1905 1906 1907
		/* If there are any unprocessed flags remaining treat
		 * this as a malformed packet. This behavior diverges from
		 * VXLAN RFC (RFC7348) which stipulates that bits in reserved
		 * in reserved fields are to be ignored. The approach here
1908
		 * maintains compatibility with previous stack code, and also
1909 1910 1911
		 * is more robust and provides a little more security in
		 * adding extensions to VXLAN.
		 */
J
Jiri Benc 已提交
1912
		goto drop;
1913 1914
	}

J
Jiri Benc 已提交
1915
	if (!raw_proto) {
1916
		if (!vxlan_set_mac(vxlan, vs, skb, vni))
J
Jiri Benc 已提交
1917 1918
			goto drop;
	} else {
1919
		skb_reset_mac_header(skb);
J
Jiri Benc 已提交
1920 1921 1922
		skb->dev = vxlan->dev;
		skb->pkt_type = PACKET_HOST;
	}
1923 1924 1925 1926 1927 1928 1929 1930 1931 1932

	oiph = skb_network_header(skb);
	skb_reset_network_header(skb);

	if (!vxlan_ecn_decapsulate(vs, oiph, skb)) {
		++vxlan->dev->stats.rx_frame_errors;
		++vxlan->dev->stats.rx_errors;
		goto drop;
	}

1933 1934 1935 1936 1937 1938 1939 1940
	rcu_read_lock();

	if (unlikely(!(vxlan->dev->flags & IFF_UP))) {
		rcu_read_unlock();
		atomic_long_inc(&vxlan->dev->rx_dropped);
		goto drop;
	}

1941 1942 1943 1944 1945 1946 1947
	stats = this_cpu_ptr(vxlan->dev->tstats);
	u64_stats_update_begin(&stats->syncp);
	stats->rx_packets++;
	stats->rx_bytes += skb->len;
	u64_stats_update_end(&stats->syncp);

	gro_cells_receive(&vxlan->gro_cells, skb);
1948 1949 1950

	rcu_read_unlock();

P
Pravin B Shelar 已提交
1951 1952 1953
	return 0;

drop:
J
Jiri Benc 已提交
1954 1955 1956
	/* Consume bad packet */
	kfree_skb(skb);
	return 0;
P
Pravin B Shelar 已提交
1957 1958
}

S
Stefano Brivio 已提交
1959 1960 1961 1962 1963 1964 1965 1966
/* Callback from net/ipv{4,6}/udp.c to check that we have a VNI for errors */
static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan;
	struct vxlan_sock *vs;
	struct vxlanhdr *hdr;
	__be32 vni;

1967
	if (!pskb_may_pull(skb, skb_transport_offset(skb) + VXLAN_HLEN))
S
Stefano Brivio 已提交
1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986
		return -EINVAL;

	hdr = vxlan_hdr(skb);

	if (!(hdr->vx_flags & VXLAN_HF_VNI))
		return -EINVAL;

	vs = rcu_dereference_sk_user_data(sk);
	if (!vs)
		return -ENOENT;

	vni = vxlan_vni(hdr->vx_vni);
	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
	if (!vxlan)
		return -ENOENT;

	return 0;
}

1987
/* Proxy-ARP handling on the transmit path.
 *
 * When @skb is an IPv4-over-Ethernet ARP request whose target is a known,
 * connected neighbour that is not bridge-local, the ARP reply is built here
 * and injected back into the stack. When the target is unknown and
 * VXLAN_F_L3MISS is set, an L3 miss notification is raised instead.
 *
 * The request is consumed on every path; always returns NETDEV_TX_OK.
 */
static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct arphdr *parp;
	u8 *arpptr, *sha;
	__be32 sip, tip;
	struct neighbour *n;

	if (dev->flags & IFF_NOARP)
		goto out;

	if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
		dev->stats.tx_dropped++;
		goto out;
	}
	parp = arp_hdr(skb);

	if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
	     parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
	    parp->ar_pro != htons(ETH_P_IP) ||
	    parp->ar_op != htons(ARPOP_REQUEST) ||
	    parp->ar_hln != dev->addr_len ||
	    parp->ar_pln != 4)
		goto out;

	/* ARP payload layout: sha, sip, tha, tip */
	arpptr = (u8 *)parp + sizeof(struct arphdr);
	sha = arpptr;
	arpptr += dev->addr_len;	/* sha */
	memcpy(&sip, arpptr, sizeof(sip));
	arpptr += sizeof(sip);
	arpptr += dev->addr_len;	/* tha */
	memcpy(&tip, arpptr, sizeof(tip));

	if (ipv4_is_loopback(tip) ||
	    ipv4_is_multicast(tip))
		goto out;

	n = neigh_lookup(&arp_tbl, &tip, dev);

	if (n) {
		struct vxlan_fdb *f;
		struct sk_buff	*reply;

		if (!(n->nud_state & NUD_CONNECTED)) {
			neigh_release(n);
			goto out;
		}

		f = vxlan_find_mac(vxlan, n->ha, vni);
		/* Entries that forward through a nexthop object are sent via
		 * f->nh rather than f->remotes (see vxlan_xmit()), so their
		 * remote list must not be dereferenced here.
		 */
		if (f && !rcu_access_pointer(f->nh) &&
		    vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
			/* bridge-local neighbor */
			neigh_release(n);
			goto out;
		}

		reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
				n->ha, sha);

		neigh_release(n);

		if (reply == NULL)
			goto out;

		skb_reset_mac_header(reply);
		__skb_pull(reply, skb_network_offset(reply));
		reply->ip_summed = CHECKSUM_UNNECESSARY;
		reply->pkt_type = PACKET_HOST;

		if (netif_rx_ni(reply) == NET_RX_DROP)
			dev->stats.rx_dropped++;
	} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
		union vxlan_addr ipa = {
			.sin.sin_addr.s_addr = tip,
			.sin.sin_family = AF_INET,
		};

		vxlan_ip_miss(dev, &ipa);
	}
out:
	consume_skb(skb);
	return NETDEV_TX_OK;
}

C
Cong Wang 已提交
2069
#if IS_ENABLED(CONFIG_IPV6)
2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081
/* Build the Neighbour Advertisement that answers the Neighbour Solicitation
 * held in @request, advertising the link-layer address of neighbour @n.
 * @isrouter is copied into the Router flag of the advertisement.
 *
 * Returns a newly allocated skb whose data starts at the IPv6 header, or NULL
 * when the request has no device, cannot be fully pulled, contains a
 * zero-length option, or when the allocation fails.
 */
static struct sk_buff *vxlan_na_create(struct sk_buff *request,
	struct neighbour *n, bool isrouter)
{
	struct net_device *dev = request->dev;
	int na_olen = 8; /* opt hdr + ETH_ALEN for target */
	struct ipv6hdr *req_ip6, *rep_ip6;
	struct nd_msg *ns, *na;
	struct ethhdr *rep_eth;
	struct sk_buff *reply;
	int ns_olen, off, len;
	int na_len;
	u8 *lladdr;

	if (!dev || !pskb_may_pull(request, request->len))
		return NULL;

	len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
		sizeof(*na) + na_olen + dev->needed_tailroom;
	reply = alloc_skb(len, GFP_ATOMIC);
	if (!reply)
		return NULL;

	reply->protocol = htons(ETH_P_IPV6);
	reply->dev = dev;
	skb_reserve(reply, LL_RESERVED_SPACE(dev));
	skb_push(reply, sizeof(struct ethhdr));
	skb_reset_mac_header(reply);

	req_ip6 = ipv6_hdr(request);
	ns = (struct nd_msg *)(req_ip6 + 1);

	/* Default to the sender's MAC; a source link-layer option overrides it */
	lladdr = eth_hdr(request)->h_source;
	ns_olen = request->len - skb_network_offset(request) -
		sizeof(struct ipv6hdr) - sizeof(*ns);
	off = 0;
	while (off < ns_olen - 1) {
		/* second option byte is the length, shifted left by 3 below */
		u8 units = ns->opt[off + 1];

		if (!units) {
			kfree_skb(reply);
			return NULL;
		}
		if (ns->opt[off] == ND_OPT_SOURCE_LL_ADDR) {
			lladdr = ns->opt + off + sizeof(struct nd_opt_hdr);
			break;
		}
		off += units << 3;
	}

	/* Ethernet header */
	rep_eth = eth_hdr(reply);
	ether_addr_copy(rep_eth->h_dest, lladdr);
	ether_addr_copy(rep_eth->h_source, n->ha);
	rep_eth->h_proto = htons(ETH_P_IPV6);

	skb_pull(reply, sizeof(struct ethhdr));
	skb_reset_network_header(reply);
	skb_put(reply, sizeof(struct ipv6hdr));

	/* IPv6 header */
	rep_ip6 = ipv6_hdr(reply);
	memset(rep_ip6, 0, sizeof(struct ipv6hdr));
	rep_ip6->version = 6;
	rep_ip6->priority = req_ip6->priority;
	rep_ip6->nexthdr = IPPROTO_ICMPV6;
	rep_ip6->hop_limit = 255;
	rep_ip6->daddr = req_ip6->saddr;
	rep_ip6->saddr = *(struct in6_addr *)n->primary_key;

	skb_pull(reply, sizeof(struct ipv6hdr));
	skb_reset_transport_header(reply);

	/* Neighbor Advertisement, followed by one target link-layer option */
	na_len = sizeof(*na) + na_olen;
	na = skb_put_zero(reply, na_len);
	na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
	na->icmph.icmp6_router = isrouter;
	na->icmph.icmp6_override = 1;
	na->icmph.icmp6_solicited = 1;
	na->target = ns->target;
	na->opt[0] = ND_OPT_TARGET_LL_ADDR;
	na->opt[1] = na_olen >> 3;
	ether_addr_copy(&na->opt[2], n->ha);

	na->icmph.icmp6_cksum = csum_ipv6_magic(&rep_ip6->saddr,
		&rep_ip6->daddr, na_len, IPPROTO_ICMPV6,
		csum_partial(na, na_len, 0));

	rep_ip6->payload_len = htons(na_len);

	/* Expose the IPv6 header again before handing the skb back */
	skb_push(reply, sizeof(struct ipv6hdr));

	reply->ip_summed = CHECKSUM_UNNECESSARY;

	return reply;
}

2161
/* Proxy Neighbour Discovery handling on the transmit path.
 *
 * When the solicited target in @skb is a known, connected neighbour that is
 * not bridge-local, a Neighbour Advertisement is built with
 * vxlan_na_create() and injected back into the stack. When the target is
 * unknown and VXLAN_F_L3MISS is set, an L3 miss notification is raised.
 *
 * The caller is expected to have validated that @skb holds an IPv6 header
 * followed by a struct nd_msg (vxlan_xmit() pulls both before calling).
 *
 * The request is consumed on every path; always returns NETDEV_TX_OK.
 */
static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct in6_addr *daddr;
	const struct ipv6hdr *iphdr;
	struct inet6_dev *in6_dev;
	struct neighbour *n;
	struct nd_msg *msg;

	in6_dev = __in6_dev_get(dev);
	if (!in6_dev)
		goto out;

	iphdr = ipv6_hdr(skb);
	daddr = &iphdr->daddr;
	msg = (struct nd_msg *)(iphdr + 1);

	if (ipv6_addr_loopback(daddr) ||
	    ipv6_addr_is_multicast(&msg->target))
		goto out;

	n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev);

	if (n) {
		struct vxlan_fdb *f;
		struct sk_buff *reply;

		if (!(n->nud_state & NUD_CONNECTED)) {
			neigh_release(n);
			goto out;
		}

		f = vxlan_find_mac(vxlan, n->ha, vni);
		/* Entries that forward through a nexthop object are sent via
		 * f->nh rather than f->remotes (see vxlan_xmit()), so their
		 * remote list must not be dereferenced here.
		 */
		if (f && !rcu_access_pointer(f->nh) &&
		    vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
			/* bridge-local neighbor */
			neigh_release(n);
			goto out;
		}

		reply = vxlan_na_create(skb, n,
					!!(f ? f->flags & NTF_ROUTER : 0));

		neigh_release(n);

		if (reply == NULL)
			goto out;

		if (netif_rx_ni(reply) == NET_RX_DROP)
			dev->stats.rx_dropped++;

	} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
		union vxlan_addr ipa = {
			.sin6.sin6_addr = msg->target,
			.sin6.sin6_family = AF_INET6,
		};

		vxlan_ip_miss(dev, &ipa);
	}

out:
	consume_skb(skb);
	return NETDEV_TX_OK;
}
#endif

D
David Stevens 已提交
2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236
static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct neighbour *n;

	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
		return false;

	n = NULL;
	switch (ntohs(eth_hdr(skb)->h_proto)) {
	case ETH_P_IP:
2237 2238 2239
	{
		struct iphdr *pip;

D
David Stevens 已提交
2240 2241 2242 2243
		if (!pskb_may_pull(skb, sizeof(struct iphdr)))
			return false;
		pip = ip_hdr(skb);
		n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
2244
		if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
C
Cong Wang 已提交
2245 2246
			union vxlan_addr ipa = {
				.sin.sin_addr.s_addr = pip->daddr,
2247
				.sin.sin_family = AF_INET,
C
Cong Wang 已提交
2248 2249 2250 2251 2252 2253
			};

			vxlan_ip_miss(dev, &ipa);
			return false;
		}

D
David Stevens 已提交
2254
		break;
2255 2256 2257 2258 2259 2260 2261 2262 2263 2264
	}
#if IS_ENABLED(CONFIG_IPV6)
	case ETH_P_IPV6:
	{
		struct ipv6hdr *pip6;

		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
			return false;
		pip6 = ipv6_hdr(skb);
		n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev);
2265
		if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
2266 2267
			union vxlan_addr ipa = {
				.sin6.sin6_addr = pip6->daddr,
2268
				.sin6.sin6_family = AF_INET6,
2269 2270 2271 2272 2273 2274 2275 2276 2277
			};

			vxlan_ip_miss(dev, &ipa);
			return false;
		}

		break;
	}
#endif
D
David Stevens 已提交
2278 2279 2280 2281 2282 2283 2284
	default:
		return false;
	}

	if (n) {
		bool diff;

2285
		diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha);
D
David Stevens 已提交
2286 2287 2288 2289 2290 2291 2292
		if (diff) {
			memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
				dev->addr_len);
			memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
		}
		neigh_release(n);
		return diff;
C
Cong Wang 已提交
2293 2294
	}

D
David Stevens 已提交
2295 2296 2297
	return false;
}

2298
/* Encode the Group Based Policy metadata in @md into the VXLAN header @vxh.
 * Nothing is written when md->gbp is zero. @vxflags is not used here.
 */
static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)vxh;
	u32 policy = md->gbp;

	if (!policy)
		return;

	vxh->vx_flags |= VXLAN_HF_GBP;

	if (policy & VXLAN_GBP_DONT_LEARN)
		gbp->dont_learn = 1;
	if (policy & VXLAN_GBP_POLICY_APPLIED)
		gbp->policy_applied = 1;

	gbp->policy_id = htons(policy & VXLAN_GBP_ID_MASK);
}

J
Jiri Benc 已提交
2318 2319 2320 2321 2322 2323
/* Fill the Generic Protocol Extension fields of @vxh for an inner packet of
 * EtherType @protocol. @vxflags is not used here.
 *
 * Returns 0 on success, or -EPFNOSUPPORT when tun_p_from_eth_p() yields no
 * next-protocol value for @protocol.
 */
static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags,
			       __be16 protocol)
{
	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)vxh;

	gpe->np_applied = 1;
	gpe->next_protocol = tun_p_from_eth_p(protocol);

	return gpe->next_protocol ? 0 : -EPFNOSUPPORT;
}

2330 2331 2332
static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
			   int iphdr_len, __be32 vni,
			   struct vxlan_metadata *md, u32 vxflags,
2333
			   bool udp_sum)
C
Cong Wang 已提交
2334 2335 2336 2337
{
	struct vxlanhdr *vxh;
	int min_headroom;
	int err;
T
Tom Herbert 已提交
2338
	int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
J
Jiri Benc 已提交
2339
	__be16 inner_protocol = htons(ETH_P_TEB);
T
Tom Herbert 已提交
2340

2341
	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
T
Tom Herbert 已提交
2342 2343 2344 2345 2346 2347
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		int csum_start = skb_checksum_start_offset(skb);

		if (csum_start <= VXLAN_MAX_REMCSUM_START &&
		    !(csum_start & VXLAN_RCO_SHIFT_MASK) &&
		    (skb->csum_offset == offsetof(struct udphdr, check) ||
2348
		     skb->csum_offset == offsetof(struct tcphdr, check)))
T
Tom Herbert 已提交
2349 2350
			type |= SKB_GSO_TUNNEL_REMCSUM;
	}
C
Cong Wang 已提交
2351 2352

	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
2353
			+ VXLAN_HLEN + iphdr_len;
2354 2355 2356

	/* Need space for new headers (invalidates iph ptr) */
	err = skb_cow_head(skb, min_headroom);
J
Jiri Benc 已提交
2357
	if (unlikely(err))
P
pravin shelar 已提交
2358
		return err;
2359

2360 2361
	err = iptunnel_handle_offloads(skb, type);
	if (err)
P
pravin shelar 已提交
2362
		return err;
2363

2364
	vxh = __skb_push(skb, sizeof(*vxh));
2365 2366
	vxh->vx_flags = VXLAN_HF_VNI;
	vxh->vx_vni = vxlan_vni_field(vni);
2367

T
Tom Herbert 已提交
2368
	if (type & SKB_GSO_TUNNEL_REMCSUM) {
2369
		unsigned int start;
T
Tom Herbert 已提交
2370

2371 2372 2373
		start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr);
		vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset);
		vxh->vx_flags |= VXLAN_HF_RCO;
T
Tom Herbert 已提交
2374 2375 2376 2377 2378 2379 2380

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}
	}

2381 2382
	if (vxflags & VXLAN_F_GBP)
		vxlan_build_gbp_hdr(vxh, vxflags, md);
J
Jiri Benc 已提交
2383 2384 2385
	if (vxflags & VXLAN_F_GPE) {
		err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
		if (err < 0)
P
pravin shelar 已提交
2386
			return err;
J
Jiri Benc 已提交
2387 2388
		inner_protocol = skb->protocol;
	}
T
Thomas Graf 已提交
2389

J
Jiri Benc 已提交
2390
	skb_set_inner_protocol(skb, inner_protocol);
2391
	return 0;
2392 2393
}

2394 2395
/* Resolve the IPv4 route used to reach @daddr for an encapsulated packet.
 *
 * A cached route from @dst_cache is returned when the cache is usable for
 * this skb; it is bypassed when a TOS is given without tunnel @info. On a
 * fresh lookup, *@saddr is updated with the selected source address and the
 * result is stored in the cache when allowed.
 *
 * Returns the route, or ERR_PTR(-EIO) without a socket, ERR_PTR(-ENETUNREACH)
 * when the lookup fails, and ERR_PTR(-ELOOP) when the route points back at
 * @dev.
 */
static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct net_device *dev,
				      struct vxlan_sock *sock4,
				      struct sk_buff *skb, int oif, u8 tos,
				      __be32 daddr, __be32 *saddr, __be16 dport, __be16 sport,
				      struct dst_cache *dst_cache,
				      const struct ip_tunnel_info *info)
{
	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
	struct flowi4 fl4;
	struct rtable *rt;

	if (!sock4)
		return ERR_PTR(-EIO);

	if (!info && tos)
		use_cache = false;

	if (use_cache) {
		rt = dst_cache_get_ip4(dst_cache, saddr);
		if (rt)
			return rt;
	}

	memset(&fl4, 0, sizeof(fl4));
	fl4.flowi4_oif = oif;
	fl4.flowi4_tos = RT_TOS(tos);
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_proto = IPPROTO_UDP;
	fl4.daddr = daddr;
	fl4.saddr = *saddr;
	fl4.fl4_dport = dport;
	fl4.fl4_sport = sport;

	rt = ip_route_output_key(vxlan->net, &fl4);
	if (IS_ERR(rt)) {
		netdev_dbg(dev, "no route to %pI4\n", &daddr);
		return ERR_PTR(-ENETUNREACH);
	}

	if (rt->dst.dev == dev) {
		netdev_dbg(dev, "circular route to %pI4\n", &daddr);
		ip_rt_put(rt);
		return ERR_PTR(-ELOOP);
	}

	*saddr = fl4.saddr;
	if (use_cache)
		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);

	return rt;
}

2444 2445
#if IS_ENABLED(CONFIG_IPV6)
static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
2446
					  struct net_device *dev,
2447
					  struct vxlan_sock *sock6,
2448
					  struct sk_buff *skb, int oif, u8 tos,
2449
					  __be32 label,
2450
					  const struct in6_addr *daddr,
2451
					  struct in6_addr *saddr,
2452
					  __be16 dport, __be16 sport,
2453 2454
					  struct dst_cache *dst_cache,
					  const struct ip_tunnel_info *info)
2455
{
2456
	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
2457 2458 2459
	struct dst_entry *ndst;
	struct flowi6 fl6;

2460 2461 2462
	if (!sock6)
		return ERR_PTR(-EIO);

2463 2464
	if (tos && !info)
		use_cache = false;
2465
	if (use_cache) {
2466 2467 2468 2469 2470
		ndst = dst_cache_get_ip6(dst_cache, saddr);
		if (ndst)
			return ndst;
	}

2471 2472 2473
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.daddr = *daddr;
2474
	fl6.saddr = *saddr;
2475
	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tos), label);
2476 2477
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = IPPROTO_UDP;
2478 2479
	fl6.fl6_dport = dport;
	fl6.fl6_sport = sport;
2480

2481 2482 2483
	ndst = ipv6_stub->ipv6_dst_lookup_flow(vxlan->net, sock6->sock->sk,
					       &fl6, NULL);
	if (unlikely(IS_ERR(ndst))) {
2484 2485 2486 2487 2488 2489 2490 2491 2492
		netdev_dbg(dev, "no route to %pI6\n", daddr);
		return ERR_PTR(-ENETUNREACH);
	}

	if (unlikely(ndst->dev == dev)) {
		netdev_dbg(dev, "circular route to %pI6\n", daddr);
		dst_release(ndst);
		return ERR_PTR(-ELOOP);
	}
2493 2494

	*saddr = fl6.saddr;
2495
	if (use_cache)
2496
		dst_cache_set_ip6(dst_cache, ndst, saddr);
2497 2498 2499 2500
	return ndst;
}
#endif

2501 2502
/* Deliver @skb straight to a local VXLAN device instead of encapsulating it.
 *
 * The packet is re-targeted at @dst_vxlan's device and handed to netif_rx().
 * When @snoop is set and the destination learns addresses, the source MAC is
 * learned against the loopback address of the destination's address family.
 * TX stats are charged to @src_vxlan and RX stats (or rx_dropped) to
 * @dst_vxlan. The skb is always consumed.
 */
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
			       struct vxlan_dev *dst_vxlan, __be32 vni,
			       bool snoop)
{
	union vxlan_addr *remote_ip = &dst_vxlan->default_dst.remote_ip;
	struct pcpu_sw_netstats *tx_stats, *rx_stats;
	union vxlan_addr loopback;
	struct net_device *dev;
	int len = skb->len;	/* sampled before the pull below */

	tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
	rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats);

	skb->pkt_type = PACKET_HOST;
	skb->encapsulation = 0;
	skb->dev = dst_vxlan->dev;
	__skb_pull(skb, skb_network_offset(skb));

	if (remote_ip->sa.sa_family == AF_INET) {
		loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		loopback.sa.sa_family =  AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		loopback.sin6.sin6_addr = in6addr_loopback;
		loopback.sa.sa_family =  AF_INET6;
#endif
	}

	rcu_read_lock();
	dev = skb->dev;

	if (unlikely(!(dev->flags & IFF_UP))) {
		kfree_skb(skb);
		dev->stats.rx_dropped++;
		goto unlock;
	}

	if (snoop && (dst_vxlan->cfg.flags & VXLAN_F_LEARN))
		vxlan_snoop(dev, &loopback, eth_hdr(skb)->h_source, 0, vni);

	u64_stats_update_begin(&tx_stats->syncp);
	tx_stats->tx_packets++;
	tx_stats->tx_bytes += len;
	u64_stats_update_end(&tx_stats->syncp);

	if (netif_rx(skb) != NET_RX_SUCCESS) {
		dev->stats.rx_dropped++;
		goto unlock;
	}

	u64_stats_update_begin(&rx_stats->syncp);
	rx_stats->rx_packets++;
	rx_stats->rx_bytes += len;
	u64_stats_update_end(&rx_stats->syncp);

unlock:
	rcu_read_unlock();
}

2556
static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
2557 2558 2559 2560
				 struct vxlan_dev *vxlan,
				 union vxlan_addr *daddr,
				 __be16 dst_port, int dst_ifindex, __be32 vni,
				 struct dst_entry *dst,
2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575
				 u32 rt_flags)
{
#if IS_ENABLED(CONFIG_IPV6)
	/* IPv6 rt-flags are checked against RTF_LOCAL, but the value of
	 * RTF_LOCAL is equal to RTCF_LOCAL. So to keep code simple
	 * we can use RTCF_LOCAL which works for ipv4 and ipv6 route entry.
	 */
	BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL);
#endif
	/* Bypass encapsulation if the destination is local */
	if (rt_flags & RTCF_LOCAL &&
	    !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
		struct vxlan_dev *dst_vxlan;

		dst_release(dst);
2576
		dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni,
2577
					   daddr->sa.sa_family, dst_port,
2578
					   vxlan->cfg.flags);
2579 2580 2581 2582 2583 2584
		if (!dst_vxlan) {
			dev->stats.tx_errors++;
			kfree_skb(skb);

			return -ENOENT;
		}
2585
		vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true);
2586 2587 2588 2589 2590 2591
		return 1;
	}

	return 0;
}

2592
static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
2593 2594
			   __be32 default_vni, struct vxlan_rdst *rdst,
			   bool did_rsc)
S
stephen hemminger 已提交
2595
{
2596
	struct dst_cache *dst_cache;
2597
	struct ip_tunnel_info *info;
S
stephen hemminger 已提交
2598
	struct vxlan_dev *vxlan = netdev_priv(dev);
P
pravin shelar 已提交
2599
	const struct iphdr *old_iph = ip_hdr(skb);
C
Cong Wang 已提交
2600
	union vxlan_addr *dst;
2601
	union vxlan_addr remote_ip, local_ip;
T
Thomas Graf 已提交
2602 2603
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
C
Cong Wang 已提交
2604
	__be16 src_port = 0, dst_port;
2605
	struct dst_entry *ndst = NULL;
2606
	__be32 vni, label;
S
stephen hemminger 已提交
2607
	__u8 tos, ttl;
2608
	int ifindex;
2609
	int err;
2610
	u32 flags = vxlan->cfg.flags;
2611
	bool udp_sum = false;
2612
	bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));
S
stephen hemminger 已提交
2613

2614
	info = skb_tunnel_info(skb);
2615

T
Thomas Graf 已提交
2616
	if (rdst) {
P
pravin shelar 已提交
2617 2618 2619 2620
		dst = &rdst->remote_ip;
		if (vxlan_addr_any(dst)) {
			if (did_rsc) {
				/* short-circuited back to local bridge */
2621 2622
				vxlan_encap_bypass(skb, vxlan, vxlan,
						   default_vni, true);
P
pravin shelar 已提交
2623 2624 2625 2626 2627
				return;
			}
			goto drop;
		}

2628
		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
2629
		vni = (rdst->remote_vni) ? : default_vni;
2630
		ifindex = rdst->remote_ifindex;
2631
		local_ip = vxlan->cfg.saddr;
2632
		dst_cache = &rdst->dst_cache;
P
pravin shelar 已提交
2633
		md->gbp = skb->mark;
H
Hangbin Liu 已提交
2634 2635 2636 2637 2638 2639 2640
		if (flags & VXLAN_F_TTL_INHERIT) {
			ttl = ip_tunnel_get_ttl(old_iph, skb);
		} else {
			ttl = vxlan->cfg.ttl;
			if (!ttl && vxlan_addr_multicast(dst))
				ttl = 1;
		}
P
pravin shelar 已提交
2641 2642 2643 2644 2645 2646 2647 2648 2649 2650

		tos = vxlan->cfg.tos;
		if (tos == 1)
			tos = ip_tunnel_get_dsfield(old_iph, skb);

		if (dst->sa.sa_family == AF_INET)
			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
		else
			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
		label = vxlan->cfg.label;
T
Thomas Graf 已提交
2651 2652 2653 2654 2655 2656
	} else {
		if (!info) {
			WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
				  dev->name);
			goto drop;
		}
2657
		remote_ip.sa.sa_family = ip_tunnel_info_af(info);
2658
		if (remote_ip.sa.sa_family == AF_INET) {
2659
			remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
2660 2661
			local_ip.sin.sin_addr.s_addr = info->key.u.ipv4.src;
		} else {
2662
			remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;
2663 2664
			local_ip.sin6.sin6_addr = info->key.u.ipv6.src;
		}
T
Thomas Graf 已提交
2665
		dst = &remote_ip;
P
pravin shelar 已提交
2666 2667
		dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
		vni = tunnel_id_to_key32(info->key.tun_id);
2668
		ifindex = 0;
2669
		dst_cache = &info->dst_cache;
2670 2671 2672
		if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
			if (info->options_len < sizeof(*md))
				goto drop;
P
pravin shelar 已提交
2673
			md = ip_tunnel_info_opts(info);
2674
		}
2675 2676
		ttl = info->key.ttl;
		tos = info->key.tos;
2677
		label = info->key.label;
2678
		udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
2679
	}
P
pravin shelar 已提交
2680 2681
	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
				     vxlan->cfg.port_max, true);
2682

J
Jakub Kicinski 已提交
2683
	rcu_read_lock();
C
Cong Wang 已提交
2684
	if (dst->sa.sa_family == AF_INET) {
2685
		struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
P
pravin shelar 已提交
2686
		struct rtable *rt;
P
pravin shelar 已提交
2687
		__be16 df = 0;
2688

2689 2690 2691
		if (!ifindex)
			ifindex = sock4->sock->sk->sk_bound_dev_if;

2692
		rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos,
2693
				     dst->sin.sin_addr.s_addr,
2694
				     &local_ip.sin.sin_addr.s_addr,
2695
				     dst_port, src_port,
2696
				     dst_cache, info);
2697 2698
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
P
pravin shelar 已提交
2699
			goto tx_error;
2700
		}
C
Cong Wang 已提交
2701

2702
		if (!info) {
2703
			/* Bypass encapsulation if the destination is local */
2704
			err = encap_bypass_if_local(skb, dev, vxlan, dst,
2705 2706
						    dst_port, ifindex, vni,
						    &rt->dst, rt->rt_flags);
2707
			if (err)
J
Jakub Kicinski 已提交
2708
				goto out_unlock;
2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719

			if (vxlan->cfg.df == VXLAN_DF_SET) {
				df = htons(IP_DF);
			} else if (vxlan->cfg.df == VXLAN_DF_INHERIT) {
				struct ethhdr *eth = eth_hdr(skb);

				if (ntohs(eth->h_proto) == ETH_P_IPV6 ||
				    (ntohs(eth->h_proto) == ETH_P_IP &&
				     old_iph->frag_off & htons(IP_DF)))
					df = htons(IP_DF);
			}
2720
		} else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) {
2721
			df = htons(IP_DF);
2722
		}
2723

P
pravin shelar 已提交
2724
		ndst = &rt->dst;
2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741
		err = skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM,
					    netif_is_any_bridge_port(dev));
		if (err < 0) {
			goto tx_error;
		} else if (err) {
			if (info) {
				struct in_addr src, dst;

				src = remote_ip.sin.sin_addr;
				dst = local_ip.sin.sin_addr;
				info->key.u.ipv4.src = src.s_addr;
				info->key.u.ipv4.dst = dst.s_addr;
			}
			vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
			dst_release(ndst);
			goto out_unlock;
		}
X
Xin Long 已提交
2742

2743
		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
C
Cong Wang 已提交
2744
		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
P
pravin shelar 已提交
2745
		err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
2746
				      vni, md, flags, udp_sum);
2747
		if (err < 0)
P
pravin shelar 已提交
2748
			goto tx_error;
2749

2750
		udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, local_ip.sin.sin_addr.s_addr,
2751 2752
				    dst->sin.sin_addr.s_addr, tos, ttl, df,
				    src_port, dst_port, xnet, !udp_sum);
C
Cong Wang 已提交
2753 2754
#if IS_ENABLED(CONFIG_IPV6)
	} else {
2755
		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
C
Cong Wang 已提交
2756

2757 2758 2759
		if (!ifindex)
			ifindex = sock6->sock->sk->sk_bound_dev_if;

2760
		ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos,
2761
					label, &dst->sin6.sin6_addr,
2762
					&local_ip.sin6.sin6_addr,
2763
					dst_port, src_port,
2764
					dst_cache, info);
2765
		if (IS_ERR(ndst)) {
2766
			err = PTR_ERR(ndst);
P
pravin shelar 已提交
2767
			ndst = NULL;
2768
			goto tx_error;
C
Cong Wang 已提交
2769
		}
2770

2771 2772
		if (!info) {
			u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
2773

2774
			err = encap_bypass_if_local(skb, dev, vxlan, dst,
2775 2776
						    dst_port, ifindex, vni,
						    ndst, rt6i_flags);
2777
			if (err)
J
Jakub Kicinski 已提交
2778
				goto out_unlock;
2779
		}
2780

2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798
		err = skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM,
					    netif_is_any_bridge_port(dev));
		if (err < 0) {
			goto tx_error;
		} else if (err) {
			if (info) {
				struct in6_addr src, dst;

				src = remote_ip.sin6.sin6_addr;
				dst = local_ip.sin6.sin6_addr;
				info->key.u.ipv6.src = src;
				info->key.u.ipv6.dst = dst;
			}

			vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
			dst_release(ndst);
			goto out_unlock;
		}
X
Xin Long 已提交
2799

2800
		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
C
Cong Wang 已提交
2801
		ttl = ttl ? : ip6_dst_hoplimit(ndst);
2802 2803
		skb_scrub_packet(skb, xnet);
		err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr),
2804
				      vni, md, flags, udp_sum);
P
pravin shelar 已提交
2805 2806 2807
		if (err < 0)
			goto tx_error;

P
pravin shelar 已提交
2808
		udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
2809
				     &local_ip.sin6.sin6_addr,
2810
				     &dst->sin6.sin6_addr, tos, ttl,
2811
				     label, src_port, dst_port, !udp_sum);
C
Cong Wang 已提交
2812 2813
#endif
	}
J
Jakub Kicinski 已提交
2814 2815
out_unlock:
	rcu_read_unlock();
2816
	return;
S
stephen hemminger 已提交
2817 2818 2819

drop:
	dev->stats.tx_dropped++;
P
pravin shelar 已提交
2820 2821
	dev_kfree_skb(skb);
	return;
S
stephen hemminger 已提交
2822 2823

tx_error:
J
Jakub Kicinski 已提交
2824
	rcu_read_unlock();
2825 2826 2827 2828
	if (err == -ELOOP)
		dev->stats.collisions++;
	else if (err == -ENETUNREACH)
		dev->stats.tx_carrier_errors++;
P
pravin shelar 已提交
2829
	dst_release(ndst);
S
stephen hemminger 已提交
2830
	dev->stats.tx_errors++;
P
pravin shelar 已提交
2831
	kfree_skb(skb);
S
stephen hemminger 已提交
2832 2833
}

2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865
/* Transmit @skb through the nexthop object attached to FDB entry @f.
 *
 * A path is picked from the nexthop using the skb flow hash and the packet
 * is sent with vxlan_xmit_one(). When the entry has no nexthop or no path
 * can be selected, the skb is freed and tx_dropped is bumped.
 */
static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev,
			  struct vxlan_fdb *f, __be32 vni, bool did_rsc)
{
	struct vxlan_rdst nh_rdst;
	bool do_xmit = false;
	struct nexthop *nh;
	u32 hash;

	memset(&nh_rdst, 0, sizeof(nh_rdst));
	hash = skb_get_hash(skb);

	rcu_read_lock();
	nh = rcu_dereference(f->nh);
	if (nh)
		do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst);
	rcu_read_unlock();

	if (likely(do_xmit)) {
		vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc);
		return;
	}

	dev->stats.tx_dropped++;
	dev_kfree_skb(skb);
}

2866 2867 2868 2869 2870 2871 2872 2873 2874
/* Transmit local packets over Vxlan
 *
 * Outer IP header inherits ECN and DF from inner header.
 * Outer UDP destination is the VXLAN assigned port.
 *           source port is based on hash of flow
 */
static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
2875
	struct vxlan_rdst *rdst, *fdst = NULL;
2876
	const struct ip_tunnel_info *info;
2877 2878
	bool did_rsc = false;
	struct vxlan_fdb *f;
2879
	struct ethhdr *eth;
2880
	__be32 vni = 0;
2881

2882
	info = skb_tunnel_info(skb);
2883

2884 2885
	skb_reset_mac_header(skb);

2886
	if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
2887 2888 2889 2890 2891 2892 2893 2894 2895 2896
		if (info && info->mode & IP_TUNNEL_INFO_BRIDGE &&
		    info->mode & IP_TUNNEL_INFO_TX) {
			vni = tunnel_id_to_key32(info->key.tun_id);
		} else {
			if (info && info->mode & IP_TUNNEL_INFO_TX)
				vxlan_xmit_one(skb, dev, vni, NULL, false);
			else
				kfree_skb(skb);
			return NETDEV_TX_OK;
		}
2897 2898
	}

2899
	if (vxlan->cfg.flags & VXLAN_F_PROXY) {
2900
		eth = eth_hdr(skb);
C
Cong Wang 已提交
2901
		if (ntohs(eth->h_proto) == ETH_P_ARP)
2902
			return arp_reduce(dev, skb, vni);
C
Cong Wang 已提交
2903
#if IS_ENABLED(CONFIG_IPV6)
2904 2905 2906 2907 2908 2909 2910 2911
		else if (ntohs(eth->h_proto) == ETH_P_IPV6 &&
			 pskb_may_pull(skb, sizeof(struct ipv6hdr) +
					    sizeof(struct nd_msg)) &&
			 ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
			struct nd_msg *m = (struct nd_msg *)(ipv6_hdr(skb) + 1);

			if (m->icmph.icmp6_code == 0 &&
			    m->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
2912
				return neigh_reduce(dev, skb, vni);
C
Cong Wang 已提交
2913 2914 2915
		}
#endif
	}
2916

2917
	eth = eth_hdr(skb);
2918
	f = vxlan_find_mac(vxlan, eth->h_dest, vni);
2919 2920
	did_rsc = false;

2921
	if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) &&
2922 2923
	    (ntohs(eth->h_proto) == ETH_P_IP ||
	     ntohs(eth->h_proto) == ETH_P_IPV6)) {
2924 2925
		did_rsc = route_shortcircuit(dev, skb);
		if (did_rsc)
2926
			f = vxlan_find_mac(vxlan, eth->h_dest, vni);
2927 2928
	}

2929
	if (f == NULL) {
2930
		f = vxlan_find_mac(vxlan, all_zeros_mac, vni);
2931
		if (f == NULL) {
2932
			if ((vxlan->cfg.flags & VXLAN_F_L2MISS) &&
2933 2934 2935 2936
			    !is_multicast_ether_addr(eth->h_dest))
				vxlan_fdb_miss(vxlan, eth->h_dest);

			dev->stats.tx_dropped++;
2937
			kfree_skb(skb);
2938 2939 2940
			return NETDEV_TX_OK;
		}
	}
2941

2942 2943 2944 2945 2946 2947
	if (rcu_access_pointer(f->nh)) {
		vxlan_xmit_nh(skb, dev, f,
			      (vni ? : vxlan->default_dst.remote_vni), did_rsc);
	} else {
		list_for_each_entry_rcu(rdst, &f->remotes, list) {
			struct sk_buff *skb1;
2948

2949 2950 2951 2952 2953 2954 2955
			if (!fdst) {
				fdst = rdst;
				continue;
			}
			skb1 = skb_clone(skb, GFP_ATOMIC);
			if (skb1)
				vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc);
2956
		}
2957 2958 2959 2960
		if (fdst)
			vxlan_xmit_one(skb, dev, vni, fdst, did_rsc);
		else
			kfree_skb(skb);
2961 2962
	}

2963
	return NETDEV_TX_OK;
2964 2965
}

S
stephen hemminger 已提交
2966
/* Walk the forwarding table and purge stale entries */
2967
static void vxlan_cleanup(struct timer_list *t)
S
stephen hemminger 已提交
2968
{
2969
	struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer);
S
stephen hemminger 已提交
2970 2971 2972 2973 2974 2975 2976 2977
	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
	unsigned int h;

	if (!netif_running(vxlan->dev))
		return;

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;
2978

2979
		spin_lock(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
2980 2981 2982 2983 2984
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
			unsigned long timeout;

2985
			if (f->state & (NUD_PERMANENT | NUD_NOARP))
S
stephen hemminger 已提交
2986 2987
				continue;

2988 2989 2990
			if (f->flags & NTF_EXT_LEARNED)
				continue;

2991
			timeout = f->used + vxlan->cfg.age_interval * HZ;
S
stephen hemminger 已提交
2992 2993 2994 2995 2996
			if (time_before_eq(timeout, jiffies)) {
				netdev_dbg(vxlan->dev,
					   "garbage collect %pM\n",
					   f->eth_addr);
				f->state = NUD_STALE;
2997
				vxlan_fdb_destroy(vxlan, f, true, true);
S
stephen hemminger 已提交
2998 2999 3000
			} else if (time_before(timeout, next_timer))
				next_timer = timeout;
		}
3001
		spin_unlock(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
3002 3003 3004 3005 3006
	}

	mod_timer(&vxlan->age_timer, next_timer);
}

3007 3008 3009 3010 3011
/* Unhash the device's IPv4 (and, when built in, IPv6) nodes from the socket
 * VNI lists, under the per-netns sock_lock.
 */
static void vxlan_vs_del_dev(struct vxlan_dev *vxlan)
{
	struct vxlan_net *net_priv = net_generic(vxlan->net, vxlan_net_id);

	spin_lock(&net_priv->sock_lock);

	hlist_del_init_rcu(&vxlan->hlist4.hlist);
#if IS_ENABLED(CONFIG_IPV6)
	hlist_del_init_rcu(&vxlan->hlist6.hlist);
#endif

	spin_unlock(&net_priv->sock_lock);
}

J
Jiri Benc 已提交
3019 3020
static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan,
			     struct vxlan_dev_node *node)
3021
{
3022
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
3023
	__be32 vni = vxlan->default_dst.remote_vni;
3024

J
Jiri Benc 已提交
3025
	node->vxlan = vxlan;
3026
	spin_lock(&vn->sock_lock);
J
Jiri Benc 已提交
3027
	hlist_add_head_rcu(&node->hlist, vni_head(vs, vni));
3028
	spin_unlock(&vn->sock_lock);
3029 3030
}

S
stephen hemminger 已提交
3031 3032 3033
/* Setup stats when device is created */
static int vxlan_init(struct net_device *dev)
{
3034 3035 3036
	struct vxlan_dev *vxlan = netdev_priv(dev);
	int err;

3037
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
3038
	if (!dev->tstats)
S
stephen hemminger 已提交
3039 3040
		return -ENOMEM;

3041 3042 3043 3044 3045 3046
	err = gro_cells_init(&vxlan->gro_cells, dev);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

S
stephen hemminger 已提交
3047 3048 3049
	return 0;
}

3050
/* Destroy the default (all-zeros MAC) FDB entry for @vni, if one exists,
 * while holding that entry's hash bucket lock.
 */
static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni)
{
	u32 idx = fdb_head_index(vxlan, all_zeros_mac, vni);
	struct vxlan_fdb *default_fdb;

	spin_lock_bh(&vxlan->hash_lock[idx]);

	default_fdb = __vxlan_find_mac(vxlan, all_zeros_mac, vni);
	if (default_fdb)
		vxlan_fdb_destroy(vxlan, default_fdb, true, true);

	spin_unlock_bh(&vxlan->hash_lock[idx]);
}

3062 3063 3064 3065
static void vxlan_uninit(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

3066 3067
	gro_cells_destroy(&vxlan->gro_cells);

3068
	vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni);
3069

3070 3071 3072
	free_percpu(dev->tstats);
}

S
stephen hemminger 已提交
3073 3074 3075 3076
/* Start ageing timer and join group when device is brought up */
static int vxlan_open(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
3077
	int ret;
3078

3079 3080 3081
	ret = vxlan_sock_add(vxlan);
	if (ret < 0)
		return ret;
S
stephen hemminger 已提交
3082

3083
	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
3084
		ret = vxlan_igmp_join(vxlan);
3085 3086
		if (ret == -EADDRINUSE)
			ret = 0;
3087
		if (ret) {
3088
			vxlan_sock_release(vxlan);
3089 3090
			return ret;
		}
S
stephen hemminger 已提交
3091 3092
	}

3093
	if (vxlan->cfg.age_interval)
S
stephen hemminger 已提交
3094 3095
		mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);

3096
	return ret;
S
stephen hemminger 已提交
3097 3098 3099
}

/* Purge the forwarding table */
3100
static void vxlan_flush(struct vxlan_dev *vxlan, bool do_all)
S
stephen hemminger 已提交
3101
{
3102
	unsigned int h;
S
stephen hemminger 已提交
3103 3104 3105

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;
3106 3107

		spin_lock_bh(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
3108 3109 3110
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
3111 3112
			if (!do_all && (f->state & (NUD_PERMANENT | NUD_NOARP)))
				continue;
3113
			/* the all_zeros_mac entry is deleted at vxlan_uninit */
T
Taehee Yoo 已提交
3114 3115 3116 3117
			if (is_zero_ether_addr(f->eth_addr) &&
			    f->vni == vxlan->cfg.vni)
				continue;
			vxlan_fdb_destroy(vxlan, f, true, true);
S
stephen hemminger 已提交
3118
		}
3119
		spin_unlock_bh(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
3120 3121 3122 3123 3124 3125 3126
	}
}

/* Cleanup timer and forwarding table on shutdown */
static int vxlan_stop(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
N
Nicolas Dichtel 已提交
3127
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
3128
	int ret = 0;
S
stephen hemminger 已提交
3129

3130
	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
3131
	    !vxlan_group_used(vn, vxlan))
3132
		ret = vxlan_igmp_leave(vxlan);
S
stephen hemminger 已提交
3133 3134 3135

	del_timer_sync(&vxlan->age_timer);

3136
	vxlan_flush(vxlan, false);
3137
	vxlan_sock_release(vxlan);
S
stephen hemminger 已提交
3138

3139
	return ret;
S
stephen hemminger 已提交
3140 3141 3142 3143 3144 3145 3146
}

/* Stub, nothing needs to be done.
 * Installed as .ndo_set_rx_mode in vxlan_netdev_ether_ops; the body is
 * intentionally empty.
 */
static void vxlan_set_multicast_list(struct net_device *dev)
{
}

3147
static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
3148
{
3149 3150 3151 3152
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	struct net_device *lowerdev = __dev_get_by_index(vxlan->net,
							 dst->remote_ifindex);
3153
	bool use_ipv6 = !!(vxlan->cfg.flags & VXLAN_F_IPV6);
3154

3155 3156 3157 3158 3159 3160 3161
	/* This check is different than dev->max_mtu, because it looks at
	 * the lowerdev->mtu, rather than the static dev->max_mtu
	 */
	if (lowerdev) {
		int max_mtu = lowerdev->mtu -
			      (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
		if (new_mtu > max_mtu)
D
David Wragg 已提交
3162 3163 3164
			return -EINVAL;
	}

3165 3166 3167 3168
	dev->mtu = new_mtu;
	return 0;
}

3169 3170 3171 3172 3173 3174 3175 3176 3177 3178
static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	__be16 sport, dport;

	sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
				  vxlan->cfg.port_max, true);
	dport = info->key.tp_dst ? : vxlan->cfg.dst_port;

3179
	if (ip_tunnel_info_af(info) == AF_INET) {
3180
		struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
3181 3182
		struct rtable *rt;

3183
		rt = vxlan_get_route(vxlan, dev, sock4, skb, 0, info->key.tos,
3184
				     info->key.u.ipv4.dst,
3185 3186
				     &info->key.u.ipv4.src, dport, sport,
				     &info->dst_cache, info);
3187 3188 3189
		if (IS_ERR(rt))
			return PTR_ERR(rt);
		ip_rt_put(rt);
3190 3191
	} else {
#if IS_ENABLED(CONFIG_IPV6)
3192
		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
3193 3194
		struct dst_entry *ndst;

3195
		ndst = vxlan6_get_route(vxlan, dev, sock6, skb, 0, info->key.tos,
3196
					info->key.label, &info->key.u.ipv6.dst,
3197 3198
					&info->key.u.ipv6.src, dport, sport,
					&info->dst_cache, info);
3199 3200 3201 3202 3203 3204 3205
		if (IS_ERR(ndst))
			return PTR_ERR(ndst);
		dst_release(ndst);
#else /* !CONFIG_IPV6 */
		return -EPFNOSUPPORT;
#endif
	}
3206 3207
	info->key.tp_src = sport;
	info->key.tp_dst = dport;
3208
	return 0;
3209 3210
}

3211
static const struct net_device_ops vxlan_netdev_ether_ops = {
S
stephen hemminger 已提交
3212
	.ndo_init		= vxlan_init,
3213
	.ndo_uninit		= vxlan_uninit,
S
stephen hemminger 已提交
3214 3215 3216
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,
	.ndo_start_xmit		= vxlan_xmit,
3217
	.ndo_get_stats64	= ip_tunnel_get_stats64,
S
stephen hemminger 已提交
3218
	.ndo_set_rx_mode	= vxlan_set_multicast_list,
3219
	.ndo_change_mtu		= vxlan_change_mtu,
S
stephen hemminger 已提交
3220 3221 3222 3223 3224
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_fdb_add		= vxlan_fdb_add,
	.ndo_fdb_del		= vxlan_fdb_delete,
	.ndo_fdb_dump		= vxlan_fdb_dump,
R
Roopa Prabhu 已提交
3225
	.ndo_fdb_get		= vxlan_fdb_get,
3226
	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,
3227
	.ndo_change_proto_down  = dev_change_proto_down_generic,
S
stephen hemminger 已提交
3228 3229
};

J
Jiri Benc 已提交
3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240
/* Netdev operations installed by vxlan_raw_setup() (the VXLAN_F_GPE case in
 * vxlan_config_apply()). Compared with vxlan_netdev_ether_ops this table has
 * no rx-mode, MAC address, FDB or proto-down callbacks.
 */
static const struct net_device_ops vxlan_netdev_raw_ops = {
	.ndo_init		= vxlan_init,
	.ndo_uninit		= vxlan_uninit,
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,
	.ndo_start_xmit		= vxlan_xmit,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_change_mtu		= vxlan_change_mtu,
	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,
};

S
stephen hemminger 已提交
3241 3242 3243 3244 3245
/* Info for udev, that this is a virtual tunnel endpoint.
 * Attached to every device in vxlan_setup() via SET_NETDEV_DEVTYPE().
 */
static struct device_type vxlan_type = {
	.name = "vxlan",
};

3246
/* Calls the ndo_udp_tunnel_add of the caller in order to
J
Joseph Gasparakis 已提交
3247
 * supply the listening VXLAN udp ports. Callers are expected
3248
 * to implement the ndo_udp_tunnel_add.
3249
 */
3250
static void vxlan_offload_rx_ports(struct net_device *dev, bool push)
3251 3252 3253 3254
{
	struct vxlan_sock *vs;
	struct net *net = dev_net(dev);
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
J
Joseph Gasparakis 已提交
3255
	unsigned int i;
3256 3257 3258

	spin_lock(&vn->sock_lock);
	for (i = 0; i < PORT_HASH_SIZE; ++i) {
3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271
		hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) {
			unsigned short type;

			if (vs->flags & VXLAN_F_GPE)
				type = UDP_TUNNEL_TYPE_VXLAN_GPE;
			else
				type = UDP_TUNNEL_TYPE_VXLAN;

			if (push)
				udp_tunnel_push_rx_port(dev, vs->sock, type);
			else
				udp_tunnel_drop_rx_port(dev, vs->sock, type);
		}
3272 3273 3274 3275
	}
	spin_unlock(&vn->sock_lock);
}

S
stephen hemminger 已提交
3276 3277 3278 3279
/* Initialize the device structure. */
static void vxlan_setup(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
3280
	unsigned int h;
S
stephen hemminger 已提交
3281

3282 3283 3284
	eth_hw_addr_random(dev);
	ether_setup(dev);

3285
	dev->needs_free_netdev = true;
S
stephen hemminger 已提交
3286 3287 3288
	SET_NETDEV_DEVTYPE(dev, &vxlan_type);

	dev->features	|= NETIF_F_LLTX;
3289
	dev->features	|= NETIF_F_SG | NETIF_F_HW_CSUM;
3290
	dev->features   |= NETIF_F_RXCSUM;
3291
	dev->features   |= NETIF_F_GSO_SOFTWARE;
3292

3293
	dev->vlan_features = dev->features;
3294
	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
3295
	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
3296
	netif_keep_dst(dev);
3297
	dev->priv_flags |= IFF_NO_QUEUE;
S
stephen hemminger 已提交
3298

3299 3300 3301 3302
	/* MTU range: 68 - 65535 */
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = ETH_MAX_MTU;

3303
	INIT_LIST_HEAD(&vxlan->next);
S
stephen hemminger 已提交
3304

3305
	timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE);
S
stephen hemminger 已提交
3306 3307 3308

	vxlan->dev = dev;

3309 3310
	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		spin_lock_init(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
3311
		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
3312
	}
S
stephen hemminger 已提交
3313 3314
}

3315 3316 3317 3318 3319 3320 3321
/* Finish setup of an Ethernet-mode device: clear IFF_TX_SKB_SHARING, set
 * IFF_LIVE_ADDR_CHANGE and install vxlan_netdev_ether_ops. Called from
 * vxlan_config_apply() when VXLAN_F_GPE is not set.
 */
static void vxlan_ether_setup(struct net_device *dev)
{
	dev->priv_flags &= ~IFF_TX_SKB_SHARING;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
	dev->netdev_ops = &vxlan_netdev_ether_ops;
}

J
Jiri Benc 已提交
3322 3323
static void vxlan_raw_setup(struct net_device *dev)
{
3324
	dev->header_ops = NULL;
J
Jiri Benc 已提交
3325 3326 3327 3328 3329 3330 3331
	dev->type = ARPHRD_NONE;
	dev->hard_header_len = 0;
	dev->addr_len = 0;
	dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
	dev->netdev_ops = &vxlan_netdev_raw_ops;
}

S
stephen hemminger 已提交
3332 3333
static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
3334
	[IFLA_VXLAN_GROUP]	= { .len = sizeof_field(struct iphdr, daddr) },
C
Cong Wang 已提交
3335
	[IFLA_VXLAN_GROUP6]	= { .len = sizeof(struct in6_addr) },
S
stephen hemminger 已提交
3336
	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
3337
	[IFLA_VXLAN_LOCAL]	= { .len = sizeof_field(struct iphdr, saddr) },
C
Cong Wang 已提交
3338
	[IFLA_VXLAN_LOCAL6]	= { .len = sizeof(struct in6_addr) },
S
stephen hemminger 已提交
3339 3340
	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
3341
	[IFLA_VXLAN_LABEL]	= { .type = NLA_U32 },
S
stephen hemminger 已提交
3342 3343 3344
	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
	[IFLA_VXLAN_AGEING]	= { .type = NLA_U32 },
	[IFLA_VXLAN_LIMIT]	= { .type = NLA_U32 },
3345
	[IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
D
David Stevens 已提交
3346 3347 3348 3349
	[IFLA_VXLAN_PROXY]	= { .type = NLA_U8 },
	[IFLA_VXLAN_RSC]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L2MISS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L3MISS]	= { .type = NLA_U8 },
3350
	[IFLA_VXLAN_COLLECT_METADATA]	= { .type = NLA_U8 },
3351
	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
3352 3353 3354
	[IFLA_VXLAN_UDP_CSUM]	= { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]	= { .type = NLA_U8 },
T
Tom Herbert 已提交
3355 3356
	[IFLA_VXLAN_REMCSUM_TX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_REMCSUM_RX]	= { .type = NLA_U8 },
T
Thomas Graf 已提交
3357
	[IFLA_VXLAN_GBP]	= { .type = NLA_FLAG, },
J
Jiri Benc 已提交
3358
	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
3359
	[IFLA_VXLAN_REMCSUM_NOPARTIAL]	= { .type = NLA_FLAG },
H
Hangbin Liu 已提交
3360
	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
3361
	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
S
stephen hemminger 已提交
3362 3363
};

3364 3365
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
			  struct netlink_ext_ack *extack)
S
stephen hemminger 已提交
3366 3367 3368
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
3369 3370
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
					    "Provided link layer address is not Ethernet");
S
stephen hemminger 已提交
3371 3372 3373 3374
			return -EINVAL;
		}

		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
3375 3376
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
					    "Provided Ethernet address is not unicast");
S
stephen hemminger 已提交
3377 3378 3379 3380
			return -EADDRNOTAVAIL;
		}
	}

3381
	if (tb[IFLA_MTU]) {
3382
		u32 mtu = nla_get_u32(tb[IFLA_MTU]);
3383

3384 3385 3386
		if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
					    "MTU must be between 68 and 65535");
3387
			return -EINVAL;
3388
		}
3389 3390
	}

3391 3392 3393
	if (!data) {
		NL_SET_ERR_MSG(extack,
			       "Required attributes not provided to perform the operation");
S
stephen hemminger 已提交
3394
		return -EINVAL;
3395
	}
S
stephen hemminger 已提交
3396 3397

	if (data[IFLA_VXLAN_ID]) {
3398 3399
		u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);

3400
		if (id >= VXLAN_N_VID) {
3401
			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_ID],
3402
					    "VXLAN ID must be lower than 16777216");
S
stephen hemminger 已提交
3403
			return -ERANGE;
3404
		}
S
stephen hemminger 已提交
3405 3406
	}

3407 3408 3409 3410 3411
	if (data[IFLA_VXLAN_PORT_RANGE]) {
		const struct ifla_vxlan_port_range *p
			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);

		if (ntohs(p->high) < ntohs(p->low)) {
3412
			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_PORT_RANGE],
3413
					    "Invalid source port range");
3414 3415 3416 3417
			return -EINVAL;
		}
	}

3418 3419 3420 3421
	if (data[IFLA_VXLAN_DF]) {
		enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]);

		if (df < 0 || df > VXLAN_DF_MAX) {
3422
			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_DF],
3423 3424 3425 3426 3427
					    "Invalid DF attribute");
			return -EINVAL;
		}
	}

S
stephen hemminger 已提交
3428 3429 3430
	return 0;
}

Y
Yan Burman 已提交
3431 3432 3433 3434 3435 3436 3437
/* ethtool get_drvinfo: report the driver name "vxlan" and VXLAN_VERSION.
 * Both copies are bounded by the size of the destination field.
 */
static void vxlan_get_drvinfo(struct net_device *netdev,
			      struct ethtool_drvinfo *drvinfo)
{
	strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
	strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
}

3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456
/* ethtool get_link_ksettings: report the link settings of the lower device
 * looked up from the default destination's remote_ifindex. When no such
 * device is found, report unknown duplex/speed and PORT_OTHER and return 0.
 */
static int vxlan_get_link_ksettings(struct net_device *dev,
				    struct ethtool_link_ksettings *cmd)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	struct net_device *lowerdev = __dev_get_by_index(vxlan->net,
							 dst->remote_ifindex);

	/* No lower device: there is nothing to query. */
	if (!lowerdev) {
		cmd->base.duplex = DUPLEX_UNKNOWN;
		cmd->base.port = PORT_OTHER;
		cmd->base.speed = SPEED_UNKNOWN;

		return 0;
	}

	return __ethtool_get_link_ksettings(lowerdev, cmd);
}

Y
Yan Burman 已提交
3457
static const struct ethtool_ops vxlan_ethtool_ops = {
3458 3459 3460
	.get_drvinfo		= vxlan_get_drvinfo,
	.get_link		= ethtool_op_get_link,
	.get_link_ksettings	= vxlan_get_link_ksettings,
Y
Yan Burman 已提交
3461 3462
};

T
Tom Herbert 已提交
3463
static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
3464
					__be16 port, u32 flags, int ifindex)
3465
{
C
Cong Wang 已提交
3466
	struct socket *sock;
T
Tom Herbert 已提交
3467 3468
	struct udp_port_cfg udp_conf;
	int err;
C
Cong Wang 已提交
3469

T
Tom Herbert 已提交
3470
	memset(&udp_conf, 0, sizeof(udp_conf));
C
Cong Wang 已提交
3471

T
Tom Herbert 已提交
3472 3473 3474
	if (ipv6) {
		udp_conf.family = AF_INET6;
		udp_conf.use_udp6_rx_checksums =
3475
		    !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
3476
		udp_conf.ipv6_v6only = 1;
T
Tom Herbert 已提交
3477 3478
	} else {
		udp_conf.family = AF_INET;
C
Cong Wang 已提交
3479 3480
	}

T
Tom Herbert 已提交
3481
	udp_conf.local_udp_port = port;
3482
	udp_conf.bind_ifindex = ifindex;
3483

T
Tom Herbert 已提交
3484 3485 3486 3487
	/* Open UDP socket */
	err = udp_sock_create(net, &udp_conf, &sock);
	if (err < 0)
		return ERR_PTR(err);
C
Cong Wang 已提交
3488

Z
Zhi Yong Wu 已提交
3489
	return sock;
C
Cong Wang 已提交
3490 3491 3492
}

/* Create new listen socket if needed */
3493
static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
3494 3495
					      __be16 port, u32 flags,
					      int ifindex)
C
Cong Wang 已提交
3496 3497 3498 3499 3500
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_sock *vs;
	struct socket *sock;
	unsigned int h;
3501
	struct udp_tunnel_sock_cfg tunnel_cfg;
C
Cong Wang 已提交
3502

3503
	vs = kzalloc(sizeof(*vs), GFP_KERNEL);
C
Cong Wang 已提交
3504 3505 3506 3507 3508 3509
	if (!vs)
		return ERR_PTR(-ENOMEM);

	for (h = 0; h < VNI_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vs->vni_list[h]);

3510
	sock = vxlan_create_sock(net, ipv6, port, flags, ifindex);
Z
Zhi Yong Wu 已提交
3511
	if (IS_ERR(sock)) {
3512
		kfree(vs);
3513
		return ERR_CAST(sock);
3514
	}
C
Cong Wang 已提交
3515 3516

	vs->sock = sock;
3517
	refcount_set(&vs->refcnt, 1);
3518
	vs->flags = (flags & VXLAN_F_RCV_FLAGS);
3519

3520 3521
	spin_lock(&vn->sock_lock);
	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
3522
	udp_tunnel_notify_add_rx_port(sock,
3523 3524
				      (vs->flags & VXLAN_F_GPE) ?
				      UDP_TUNNEL_TYPE_VXLAN_GPE :
3525
				      UDP_TUNNEL_TYPE_VXLAN);
3526
	spin_unlock(&vn->sock_lock);
3527 3528

	/* Mark socket as an encapsulation socket. */
3529
	memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
3530 3531
	tunnel_cfg.sk_user_data = vs;
	tunnel_cfg.encap_type = 1;
3532
	tunnel_cfg.encap_rcv = vxlan_rcv;
S
Stefano Brivio 已提交
3533
	tunnel_cfg.encap_err_lookup = vxlan_err_lookup;
3534
	tunnel_cfg.encap_destroy = NULL;
3535 3536
	tunnel_cfg.gro_receive = vxlan_gro_receive;
	tunnel_cfg.gro_complete = vxlan_gro_complete;
3537 3538

	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
C
Cong Wang 已提交
3539

3540 3541 3542
	return vs;
}

3543
/* Attach @vxlan to a socket of the requested family. Unless cfg.no_share is
 * set, an existing matching socket is reused (taking a reference); otherwise
 * a new one is created. The socket is published in vn4_sock/vn6_sock and
 * the device is hashed into the socket's VNI list.
 * Returns 0, -EBUSY when the found socket's refcount already hit zero, or
 * the socket creation error.
 */
static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
{
	struct vxlan_net *net_priv = net_generic(vxlan->net, vxlan_net_id);
	struct vxlan_sock *vs = NULL;
	struct vxlan_dev_node *node;
	int l3mdev_index = 0;

	if (vxlan->cfg.remote_ifindex)
		l3mdev_index = l3mdev_master_upper_ifindex_by_index(
			vxlan->net, vxlan->cfg.remote_ifindex);

	if (!vxlan->cfg.no_share) {
		bool dying;

		spin_lock(&net_priv->sock_lock);
		vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
				     vxlan->cfg.dst_port, vxlan->cfg.flags,
				     l3mdev_index);
		dying = vs && !refcount_inc_not_zero(&vs->refcnt);
		spin_unlock(&net_priv->sock_lock);

		if (dying)
			return -EBUSY;
	}

	if (!vs)
		vs = vxlan_socket_create(vxlan->net, ipv6,
					 vxlan->cfg.dst_port, vxlan->cfg.flags,
					 l3mdev_index);
	if (IS_ERR(vs))
		return PTR_ERR(vs);

#if IS_ENABLED(CONFIG_IPV6)
	if (ipv6) {
		rcu_assign_pointer(vxlan->vn6_sock, vs);
		node = &vxlan->hlist6;
	} else
#endif
	{
		rcu_assign_pointer(vxlan->vn4_sock, vs);
		node = &vxlan->hlist4;
	}

	vxlan_vs_add_dev(vs, vxlan, node);

	return 0;
}

3585 3586
static int vxlan_sock_add(struct vxlan_dev *vxlan)
{
3587 3588
	bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
	bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata;
3589
	bool ipv4 = !ipv6 || metadata;
3590 3591
	int ret = 0;

3592
	RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
3593
#if IS_ENABLED(CONFIG_IPV6)
3594
	RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
3595
	if (ipv6) {
3596
		ret = __vxlan_sock_add(vxlan, true);
3597 3598 3599
		if (ret < 0 && ret != -EAFNOSUPPORT)
			ipv4 = false;
	}
3600
#endif
3601
	if (ipv4)
3602 3603 3604 3605 3606 3607
		ret = __vxlan_sock_add(vxlan, false);
	if (ret < 0)
		vxlan_sock_release(vxlan);
	return ret;
}

3608 3609
static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf,
				 struct net_device **lower,
3610 3611
				 struct vxlan_dev *old,
				 struct netlink_ext_ack *extack)
S
stephen hemminger 已提交
3612
{
3613
	struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
3614
	struct vxlan_dev *tmp;
C
Cong Wang 已提交
3615
	bool use_ipv6 = false;
S
stephen hemminger 已提交
3616

3617 3618 3619 3620 3621 3622 3623 3624
	if (conf->flags & VXLAN_F_GPE) {
		/* For now, allow GPE only together with
		 * COLLECT_METADATA. This can be relaxed later; in such
		 * case, the other side of the PtP link will have to be
		 * provided.
		 */
		if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
		    !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
3625 3626
			NL_SET_ERR_MSG(extack,
				       "VXLAN GPE does not support this combination of attributes");
3627
			return -EINVAL;
3628
		}
J
Jiri Benc 已提交
3629
	}
3630

3631 3632
	if (!conf->remote_ip.sa.sa_family && !conf->saddr.sa.sa_family) {
		/* Unless IPv6 is explicitly requested, assume IPv4 */
3633
		conf->remote_ip.sa.sa_family = AF_INET;
3634 3635 3636 3637 3638 3639 3640
		conf->saddr.sa.sa_family = AF_INET;
	} else if (!conf->remote_ip.sa.sa_family) {
		conf->remote_ip.sa.sa_family = conf->saddr.sa.sa_family;
	} else if (!conf->saddr.sa.sa_family) {
		conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family;
	}

3641 3642 3643
	if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) {
		NL_SET_ERR_MSG(extack,
			       "Local and remote address must be from the same family");
3644
		return -EINVAL;
3645
	}
C
Cong Wang 已提交
3646

3647 3648
	if (vxlan_addr_multicast(&conf->saddr)) {
		NL_SET_ERR_MSG(extack, "Local address cannot be multicast");
3649
		return -EINVAL;
3650
	}
3651

3652
	if (conf->saddr.sa.sa_family == AF_INET6) {
3653 3654 3655
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG(extack,
				       "IPv6 support not enabled in the kernel");
3656
			return -EPFNOSUPPORT;
3657
		}
C
Cong Wang 已提交
3658
		use_ipv6 = true;
3659
		conf->flags |= VXLAN_F_IPV6;
3660 3661 3662 3663 3664 3665 3666 3667 3668

		if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) {
			int local_type =
				ipv6_addr_type(&conf->saddr.sin6.sin6_addr);
			int remote_type =
				ipv6_addr_type(&conf->remote_ip.sin6.sin6_addr);

			if (local_type & IPV6_ADDR_LINKLOCAL) {
				if (!(remote_type & IPV6_ADDR_LINKLOCAL) &&
3669 3670 3671
				    (remote_type != IPV6_ADDR_ANY)) {
					NL_SET_ERR_MSG(extack,
						       "Invalid combination of local and remote address scopes");
3672
					return -EINVAL;
3673
				}
3674 3675 3676 3677

				conf->flags |= VXLAN_F_IPV6_LINKLOCAL;
			} else {
				if (remote_type ==
3678 3679 3680
				    (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL)) {
					NL_SET_ERR_MSG(extack,
						       "Invalid combination of local and remote address scopes");
3681
					return -EINVAL;
3682
				}
3683 3684 3685 3686

				conf->flags &= ~VXLAN_F_IPV6_LINKLOCAL;
			}
		}
3687
	}
S
stephen hemminger 已提交
3688

3689 3690 3691
	if (conf->label && !use_ipv6) {
		NL_SET_ERR_MSG(extack,
			       "Label attribute only applies to IPv6 VXLAN devices");
3692
		return -EINVAL;
3693
	}
3694

3695 3696
	if (conf->remote_ifindex) {
		struct net_device *lowerdev;
3697

3698
		lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
3699 3700 3701
		if (!lowerdev) {
			NL_SET_ERR_MSG(extack,
				       "Invalid local interface, device not found");
3702
			return -ENODEV;
3703
		}
S
stephen hemminger 已提交
3704

C
Cong Wang 已提交
3705 3706 3707
#if IS_ENABLED(CONFIG_IPV6)
		if (use_ipv6) {
			struct inet6_dev *idev = __in6_dev_get(lowerdev);
3708 3709 3710
			if (idev && idev->cnf.disable_ipv6) {
				NL_SET_ERR_MSG(extack,
					       "IPv6 support disabled by administrator");
C
Cong Wang 已提交
3711
				return -EPERM;
3712
			}
C
Cong Wang 已提交
3713 3714 3715
		}
#endif

3716 3717
		*lower = lowerdev;
	} else {
3718 3719 3720 3721
		if (vxlan_addr_multicast(&conf->remote_ip)) {
			NL_SET_ERR_MSG(extack,
				       "Local interface required for multicast remote destination");

3722
			return -EINVAL;
3723
		}
3724

3725
#if IS_ENABLED(CONFIG_IPV6)
3726 3727 3728
		if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) {
			NL_SET_ERR_MSG(extack,
				       "Local interface required for link-local local/remote addresses");
3729
			return -EINVAL;
3730
		}
3731 3732
#endif

3733
		*lower = NULL;
J
Jiri Benc 已提交
3734
	}
S
stephen hemminger 已提交
3735

3736 3737 3738 3739 3740
	if (!conf->dst_port) {
		if (conf->flags & VXLAN_F_GPE)
			conf->dst_port = htons(4790); /* IANA VXLAN-GPE port */
		else
			conf->dst_port = htons(vxlan_port);
3741 3742
	}

3743 3744
	if (!conf->age_interval)
		conf->age_interval = FDB_AGE_DEFAULT;
3745

3746 3747 3748
	list_for_each_entry(tmp, &vn->vxlan_list, next) {
		if (tmp == old)
			continue;
3749

3750 3751 3752 3753 3754
		if (tmp->cfg.vni != conf->vni)
			continue;
		if (tmp->cfg.dst_port != conf->dst_port)
			continue;
		if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) !=
3755
		    (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)))
3756 3757 3758 3759 3760 3761
			continue;

		if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) &&
		    tmp->cfg.remote_ifindex != conf->remote_ifindex)
			continue;

3762 3763
		NL_SET_ERR_MSG(extack,
			       "A VXLAN device with the specified VNI already exists");
3764
		return -EEXIST;
3765
	}
3766

3767 3768 3769 3770 3771
	return 0;
}

/* Apply an already validated @conf to @dev.
 *
 * On creation (@changelink false) this also picks the raw or Ethernet
 * setup, takes an explicit MTU and records @src_net. With a @lowerdev the
 * GSO limits and base headroom are inherited from it and the MTU is capped
 * to what fits after encapsulation. Finally @conf is copied into the
 * device's private config.
 */
static void vxlan_config_apply(struct net_device *dev,
			       struct vxlan_config *conf,
			       struct net_device *lowerdev,
			       struct net *src_net,
			       bool changelink)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	bool use_ipv6 = !!(conf->flags & VXLAN_F_IPV6);
	unsigned short needed_headroom = ETH_HLEN;
	int max_mtu = ETH_MAX_MTU;

	if (!changelink) {
		if (conf->flags & VXLAN_F_GPE)
			vxlan_raw_setup(dev);
		else
			vxlan_ether_setup(dev);

		if (conf->mtu)
			dev->mtu = conf->mtu;

		vxlan->net = src_net;
	}

	dst->remote_vni = conf->vni;
	memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip));

	if (lowerdev) {
		dst->remote_ifindex = conf->remote_ifindex;

		dev->gso_max_size = lowerdev->gso_max_size;
		dev->gso_max_segs = lowerdev->gso_max_segs;

		needed_headroom = lowerdev->hard_header_len;

		max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM :
					   VXLAN_HEADROOM);
		if (max_mtu < ETH_MIN_MTU)
			max_mtu = ETH_MIN_MTU;

		/* No explicit MTU requested at creation: use the maximum. */
		if (!changelink && !conf->mtu)
			dev->mtu = max_mtu;
	}

	if (dev->mtu > max_mtu)
		dev->mtu = max_mtu;

	/* Metadata devices may carry IPv6, so reserve the larger headroom. */
	needed_headroom += (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA) ?
			   VXLAN6_HEADROOM : VXLAN_HEADROOM;
	dev->needed_headroom = needed_headroom;

	memcpy(&vxlan->cfg, conf, sizeof(*conf));
}
3826

3827
/* Validate @conf against @src_net and, when it is acceptable, apply it to
 * @dev. Returns 0 or the validation error; nothing is applied on error.
 */
static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
			       struct vxlan_config *conf, bool changelink,
			       struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct net_device *lowerdev;
	int err;

	err = vxlan_config_validate(src_net, conf, &lowerdev, vxlan, extack);
	if (!err)
		vxlan_config_apply(dev, conf, lowerdev, src_net, changelink);

	return err;
}

N
Nicolas Dichtel 已提交
3844
static int __vxlan_dev_create(struct net *net, struct net_device *dev,
3845 3846
			      struct vxlan_config *conf,
			      struct netlink_ext_ack *extack)
N
Nicolas Dichtel 已提交
3847 3848 3849
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan = netdev_priv(dev);
3850
	struct net_device *remote_dev = NULL;
3851
	struct vxlan_fdb *f = NULL;
3852
	bool unregister = false;
3853
	struct vxlan_rdst *dst;
N
Nicolas Dichtel 已提交
3854 3855
	int err;

3856
	dst = &vxlan->default_dst;
3857
	err = vxlan_dev_configure(net, dev, conf, false, extack);
N
Nicolas Dichtel 已提交
3858 3859 3860 3861 3862 3863
	if (err)
		return err;

	dev->ethtool_ops = &vxlan_ethtool_ops;

	/* create an fdb entry for a valid default destination */
3864
	if (!vxlan_addr_any(&dst->remote_ip)) {
3865
		err = vxlan_fdb_create(vxlan, all_zeros_mac,
3866
				       &dst->remote_ip,
N
Nicolas Dichtel 已提交
3867 3868
				       NUD_REACHABLE | NUD_PERMANENT,
				       vxlan->cfg.dst_port,
3869 3870 3871
				       dst->remote_vni,
				       dst->remote_vni,
				       dst->remote_ifindex,
3872
				       NTF_SELF, 0, &f, extack);
N
Nicolas Dichtel 已提交
3873 3874 3875 3876 3877
		if (err)
			return err;
	}

	err = register_netdevice(dev);
3878 3879
	if (err)
		goto errout;
3880
	unregister = true;
3881

3882 3883 3884 3885 3886 3887 3888 3889 3890 3891
	if (dst->remote_ifindex) {
		remote_dev = __dev_get_by_index(net, dst->remote_ifindex);
		if (!remote_dev)
			goto errout;

		err = netdev_upper_dev_link(remote_dev, dev, extack);
		if (err)
			goto errout;
	}

3892
	err = rtnl_configure_link(dev, NULL);
3893
	if (err)
3894
		goto unlink;
N
Nicolas Dichtel 已提交
3895

3896
	if (f) {
3897
		vxlan_fdb_insert(vxlan, all_zeros_mac, dst->remote_vni, f);
3898 3899

		/* notify default fdb entry */
3900
		err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f),
3901
				       RTM_NEWNEIGH, true, extack);
3902 3903
		if (err) {
			vxlan_fdb_destroy(vxlan, f, false, false);
3904 3905
			if (remote_dev)
				netdev_upper_dev_unlink(remote_dev, dev);
3906 3907
			goto unregister;
		}
3908
	}
3909

N
Nicolas Dichtel 已提交
3910
	list_add(&vxlan->next, &vn->vxlan_list);
3911 3912
	if (remote_dev)
		dst->remote_dev = remote_dev;
N
Nicolas Dichtel 已提交
3913
	return 0;
3914 3915 3916
unlink:
	if (remote_dev)
		netdev_upper_dev_unlink(remote_dev, dev);
3917
errout:
3918 3919 3920 3921
	/* unregister_netdevice() destroys the default FDB entry with deletion
	 * notification. But the addition notification was not sent yet, so
	 * destroy the entry by hand here.
	 */
3922
	if (f)
3923 3924
		__vxlan_fdb_free(f);
unregister:
3925 3926
	if (unregister)
		unregister_netdevice(dev);
3927
	return err;
N
Nicolas Dichtel 已提交
3928 3929
}

3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957
/* Set or clear @mask in conf->flags according to attribute @attrtype.
 *
 * An absent attribute leaves the flags untouched. An NLA_FLAG attribute
 * always sets the mask; any other attribute is read as a u8 and sets the
 * mask when non-zero, clears it when zero. During a changelink the
 * attribute is rejected with -EOPNOTSUPP unless @changelink_supported.
 */
static int vxlan_nl2flag(struct vxlan_config *conf, struct nlattr *tb[],
			  int attrtype, unsigned long mask, bool changelink,
			  bool changelink_supported,
			  struct netlink_ext_ack *extack)
{
	bool enable;

	if (!tb[attrtype])
		return 0;

	if (changelink && !changelink_supported) {
		vxlan_flag_attr_error(attrtype, extack);
		return -EOPNOTSUPP;
	}

	/* Presence alone enables a flag-type attribute. */
	enable = vxlan_policy[attrtype].type == NLA_FLAG ||
		 nla_get_u8(tb[attrtype]);

	if (enable)
		conf->flags |= mask;
	else
		conf->flags &= ~mask;

	return 0;
}

R
Roopa Prabhu 已提交
3958 3959
static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
			 struct net_device *dev, struct vxlan_config *conf,
3960
			 bool changelink, struct netlink_ext_ack *extack)
3961
{
R
Roopa Prabhu 已提交
3962
	struct vxlan_dev *vxlan = netdev_priv(dev);
3963
	int err = 0;
3964

R
Roopa Prabhu 已提交
3965
	memset(conf, 0, sizeof(*conf));
3966

R
Roopa Prabhu 已提交
3967 3968 3969 3970 3971 3972 3973
	/* if changelink operation, start with old existing cfg */
	if (changelink)
		memcpy(conf, &vxlan->cfg, sizeof(*conf));

	if (data[IFLA_VXLAN_ID]) {
		__be32 vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));

3974 3975
		if (changelink && (vni != conf->vni)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID], "Cannot change VNI");
R
Roopa Prabhu 已提交
3976
			return -EOPNOTSUPP;
3977
		}
R
Roopa Prabhu 已提交
3978 3979
		conf->vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));
	}
3980 3981

	if (data[IFLA_VXLAN_GROUP]) {
3982 3983
		if (changelink && (conf->remote_ip.sa.sa_family != AF_INET)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP], "New group address family does not match old group");
3984
			return -EOPNOTSUPP;
3985
		}
3986

R
Roopa Prabhu 已提交
3987
		conf->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
3988
		conf->remote_ip.sa.sa_family = AF_INET;
3989
	} else if (data[IFLA_VXLAN_GROUP6]) {
3990 3991
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "IPv6 support not enabled in the kernel");
3992
			return -EPFNOSUPPORT;
3993
		}
3994

3995 3996
		if (changelink && (conf->remote_ip.sa.sa_family != AF_INET6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "New group address family does not match old group");
3997
			return -EOPNOTSUPP;
3998
		}
3999

R
Roopa Prabhu 已提交
4000 4001
		conf->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
		conf->remote_ip.sa.sa_family = AF_INET6;
4002 4003 4004
	}

	if (data[IFLA_VXLAN_LOCAL]) {
4005 4006
		if (changelink && (conf->saddr.sa.sa_family != AF_INET)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL], "New local address family does not match old");
4007
			return -EOPNOTSUPP;
4008
		}
4009

R
Roopa Prabhu 已提交
4010 4011
		conf->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
		conf->saddr.sa.sa_family = AF_INET;
4012
	} else if (data[IFLA_VXLAN_LOCAL6]) {
4013 4014
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "IPv6 support not enabled in the kernel");
4015
			return -EPFNOSUPPORT;
4016
		}
4017

4018 4019
		if (changelink && (conf->saddr.sa.sa_family != AF_INET6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "New local address family does not match old");
4020
			return -EOPNOTSUPP;
4021
		}
4022

4023
		/* TODO: respect scope id */
R
Roopa Prabhu 已提交
4024 4025
		conf->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
		conf->saddr.sa.sa_family = AF_INET6;
4026 4027 4028
	}

	if (data[IFLA_VXLAN_LINK])
R
Roopa Prabhu 已提交
4029
		conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);
4030

S
stephen hemminger 已提交
4031
	if (data[IFLA_VXLAN_TOS])
R
Roopa Prabhu 已提交
4032
		conf->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
S
stephen hemminger 已提交
4033

4034
	if (data[IFLA_VXLAN_TTL])
R
Roopa Prabhu 已提交
4035
		conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
4036

H
Hangbin Liu 已提交
4037
	if (data[IFLA_VXLAN_TTL_INHERIT]) {
4038 4039 4040 4041 4042 4043
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_TTL_INHERIT,
				    VXLAN_F_TTL_INHERIT, changelink, false,
				    extack);
		if (err)
			return err;

H
Hangbin Liu 已提交
4044 4045
	}

4046
	if (data[IFLA_VXLAN_LABEL])
R
Roopa Prabhu 已提交
4047
		conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
4048 4049
			     IPV6_FLOWLABEL_MASK;

R
Roopa Prabhu 已提交
4050
	if (data[IFLA_VXLAN_LEARNING]) {
4051 4052 4053 4054 4055
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING,
				    VXLAN_F_LEARN, changelink, true,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4056 4057 4058 4059
	} else if (!changelink) {
		/* default to learn on a new device */
		conf->flags |= VXLAN_F_LEARN;
	}
S
stephen hemminger 已提交
4060

I
Ido Schimmel 已提交
4061
	if (data[IFLA_VXLAN_AGEING])
R
Roopa Prabhu 已提交
4062
		conf->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
S
stephen hemminger 已提交
4063

R
Roopa Prabhu 已提交
4064
	if (data[IFLA_VXLAN_PROXY]) {
4065 4066 4067 4068 4069
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_PROXY,
				    VXLAN_F_PROXY, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4070
	}
D
David Stevens 已提交
4071

R
Roopa Prabhu 已提交
4072
	if (data[IFLA_VXLAN_RSC]) {
4073 4074 4075 4076 4077
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_RSC,
				    VXLAN_F_RSC, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4078
	}
D
David Stevens 已提交
4079

R
Roopa Prabhu 已提交
4080
	if (data[IFLA_VXLAN_L2MISS]) {
4081 4082 4083 4084 4085
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L2MISS,
				    VXLAN_F_L2MISS, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4086
	}
D
David Stevens 已提交
4087

R
Roopa Prabhu 已提交
4088
	if (data[IFLA_VXLAN_L3MISS]) {
4089 4090 4091 4092 4093
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L3MISS,
				    VXLAN_F_L3MISS, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4094
	}
D
David Stevens 已提交
4095

R
Roopa Prabhu 已提交
4096
	if (data[IFLA_VXLAN_LIMIT]) {
4097 4098 4099
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LIMIT],
					    "Cannot change limit");
R
Roopa Prabhu 已提交
4100
			return -EOPNOTSUPP;
4101
		}
R
Roopa Prabhu 已提交
4102 4103
		conf->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
	}
S
stephen hemminger 已提交
4104

R
Roopa Prabhu 已提交
4105
	if (data[IFLA_VXLAN_COLLECT_METADATA]) {
4106 4107 4108 4109 4110
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_COLLECT_METADATA,
				    VXLAN_F_COLLECT_METADATA, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4111
	}
4112

4113
	if (data[IFLA_VXLAN_PORT_RANGE]) {
R
Roopa Prabhu 已提交
4114 4115 4116 4117 4118 4119
		if (!changelink) {
			const struct ifla_vxlan_port_range *p
				= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
			conf->port_min = ntohs(p->low);
			conf->port_max = ntohs(p->high);
		} else {
4120 4121
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE],
					    "Cannot change port range");
R
Roopa Prabhu 已提交
4122 4123
			return -EOPNOTSUPP;
		}
4124 4125
	}

R
Roopa Prabhu 已提交
4126
	if (data[IFLA_VXLAN_PORT]) {
4127 4128 4129
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT],
					    "Cannot change port");
R
Roopa Prabhu 已提交
4130
			return -EOPNOTSUPP;
4131
		}
R
Roopa Prabhu 已提交
4132 4133
		conf->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
	}
4134

R
Roopa Prabhu 已提交
4135
	if (data[IFLA_VXLAN_UDP_CSUM]) {
4136 4137 4138
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_UDP_CSUM],
					    "Cannot change UDP_CSUM flag");
R
Roopa Prabhu 已提交
4139
			return -EOPNOTSUPP;
4140
		}
R
Roopa Prabhu 已提交
4141 4142 4143
		if (!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
			conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
	}
4144

R
Roopa Prabhu 已提交
4145
	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) {
4146 4147 4148 4149 4150
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
				    VXLAN_F_UDP_ZERO_CSUM6_TX, changelink,
				    false, extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4151
	}
4152

R
Roopa Prabhu 已提交
4153
	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]) {
4154 4155 4156 4157 4158
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
				    VXLAN_F_UDP_ZERO_CSUM6_RX, changelink,
				    false, extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4159
	}
4160

R
Roopa Prabhu 已提交
4161
	if (data[IFLA_VXLAN_REMCSUM_TX]) {
4162 4163 4164 4165 4166
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_TX,
				    VXLAN_F_REMCSUM_TX, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4167
	}
T
Tom Herbert 已提交
4168

R
Roopa Prabhu 已提交
4169
	if (data[IFLA_VXLAN_REMCSUM_RX]) {
4170 4171 4172 4173 4174
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_RX,
				    VXLAN_F_REMCSUM_RX, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4175 4176 4177
	}

	if (data[IFLA_VXLAN_GBP]) {
4178 4179 4180 4181
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GBP,
				    VXLAN_F_GBP, changelink, false, extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4182 4183 4184
	}

	if (data[IFLA_VXLAN_GPE]) {
4185 4186 4187 4188 4189
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GPE,
				    VXLAN_F_GPE, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4190 4191 4192
	}

	if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) {
4193 4194 4195 4196 4197
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_NOPARTIAL,
				    VXLAN_F_REMCSUM_NOPARTIAL, changelink,
				    false, extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4198 4199 4200
	}

	if (tb[IFLA_MTU]) {
4201 4202 4203
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
					    "Cannot change mtu");
R
Roopa Prabhu 已提交
4204
			return -EOPNOTSUPP;
4205
		}
R
Roopa Prabhu 已提交
4206 4207 4208
		conf->mtu = nla_get_u32(tb[IFLA_MTU]);
	}

4209 4210 4211
	if (data[IFLA_VXLAN_DF])
		conf->df = nla_get_u8(data[IFLA_VXLAN_DF]);

R
Roopa Prabhu 已提交
4212 4213 4214 4215
	return 0;
}

static int vxlan_newlink(struct net *src_net, struct net_device *dev,
4216 4217
			 struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
R
Roopa Prabhu 已提交
4218 4219 4220 4221
{
	struct vxlan_config conf;
	int err;

4222
	err = vxlan_nl2conf(tb, data, dev, &conf, false, extack);
R
Roopa Prabhu 已提交
4223 4224 4225
	if (err)
		return err;

4226
	return __vxlan_dev_create(src_net, dev, &conf, extack);
R
Roopa Prabhu 已提交
4227
}
T
Tom Herbert 已提交
4228

R
Roopa Prabhu 已提交
4229
static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
4230 4231
			    struct nlattr *data[],
			    struct netlink_ext_ack *extack)
R
Roopa Prabhu 已提交
4232 4233
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
4234
	struct net_device *lowerdev;
R
Roopa Prabhu 已提交
4235
	struct vxlan_config conf;
4236
	struct vxlan_rdst *dst;
R
Roopa Prabhu 已提交
4237 4238
	int err;

4239
	dst = &vxlan->default_dst;
4240
	err = vxlan_nl2conf(tb, data, dev, &conf, true, extack);
R
Roopa Prabhu 已提交
4241 4242
	if (err)
		return err;
T
Thomas Graf 已提交
4243

4244 4245
	err = vxlan_config_validate(vxlan->net, &conf, &lowerdev,
				    vxlan, extack);
R
Roopa Prabhu 已提交
4246 4247
	if (err)
		return err;
4248

4249 4250 4251
	if (dst->remote_dev == lowerdev)
		lowerdev = NULL;

4252 4253 4254 4255 4256
	err = netdev_adjacent_change_prepare(dst->remote_dev, lowerdev, dev,
					     extack);
	if (err)
		return err;

R
Roopa Prabhu 已提交
4257
	/* handle default dst entry */
4258
	if (!vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip)) {
4259 4260 4261
		u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni);

		spin_lock_bh(&vxlan->hash_lock[hash_index]);
4262
		if (!vxlan_addr_any(&conf.remote_ip)) {
4263
			err = vxlan_fdb_update(vxlan, all_zeros_mac,
4264
					       &conf.remote_ip,
R
Roopa Prabhu 已提交
4265
					       NUD_REACHABLE | NUD_PERMANENT,
4266
					       NLM_F_APPEND | NLM_F_CREATE,
R
Roopa Prabhu 已提交
4267
					       vxlan->cfg.dst_port,
4268 4269
					       conf.vni, conf.vni,
					       conf.remote_ifindex,
4270
					       NTF_SELF, 0, true, extack);
R
Roopa Prabhu 已提交
4271
			if (err) {
4272
				spin_unlock_bh(&vxlan->hash_lock[hash_index]);
4273 4274
				netdev_adjacent_change_abort(dst->remote_dev,
							     lowerdev, dev);
R
Roopa Prabhu 已提交
4275 4276 4277
				return err;
			}
		}
4278 4279 4280 4281 4282 4283 4284 4285
		if (!vxlan_addr_any(&dst->remote_ip))
			__vxlan_fdb_delete(vxlan, all_zeros_mac,
					   dst->remote_ip,
					   vxlan->cfg.dst_port,
					   dst->remote_vni,
					   dst->remote_vni,
					   dst->remote_ifindex,
					   true);
4286
		spin_unlock_bh(&vxlan->hash_lock[hash_index]);
R
Roopa Prabhu 已提交
4287
	}
4288

4289 4290 4291
	if (conf.age_interval != vxlan->cfg.age_interval)
		mod_timer(&vxlan->age_timer, jiffies);

4292
	netdev_adjacent_change_commit(dst->remote_dev, lowerdev, dev);
4293
	if (lowerdev && lowerdev != dst->remote_dev)
4294
		dst->remote_dev = lowerdev;
4295
	vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true);
R
Roopa Prabhu 已提交
4296
	return 0;
S
stephen hemminger 已提交
4297 4298 4299 4300 4301 4302
}

static void vxlan_dellink(struct net_device *dev, struct list_head *head)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

4303 4304
	vxlan_flush(vxlan, true);

4305
	list_del(&vxlan->next);
S
stephen hemminger 已提交
4306
	unregister_netdevice_queue(dev, head);
4307 4308
	if (vxlan->default_dst.remote_dev)
		netdev_upper_dev_unlink(vxlan->default_dst.remote_dev, dev);
S
stephen hemminger 已提交
4309 4310 4311 4312 4313 4314
}

static size_t vxlan_get_size(const struct net_device *dev)
{

	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
C
Cong Wang 已提交
4315
		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
S
stephen hemminger 已提交
4316
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
C
Cong Wang 已提交
4317
		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
S
stephen hemminger 已提交
4318
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
H
Hangbin Liu 已提交
4319
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL_INHERIT */
S
stephen hemminger 已提交
4320
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
4321
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_DF */
4322
		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
S
stephen hemminger 已提交
4323
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
D
David Stevens 已提交
4324 4325 4326 4327
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L2MISS */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L3MISS */
4328
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_COLLECT_METADATA */
S
stephen hemminger 已提交
4329 4330
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
4331
		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
4332 4333 4334 4335
		nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
T
Tom Herbert 已提交
4336 4337
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
S
stephen hemminger 已提交
4338 4339 4340 4341 4342 4343
		0;
}

static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	const struct vxlan_dev *vxlan = netdev_priv(dev);
4344
	const struct vxlan_rdst *dst = &vxlan->default_dst;
4345
	struct ifla_vxlan_port_range ports = {
4346 4347
		.low =  htons(vxlan->cfg.port_min),
		.high = htons(vxlan->cfg.port_max),
4348
	};
S
stephen hemminger 已提交
4349

4350
	if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni)))
S
stephen hemminger 已提交
4351 4352
		goto nla_put_failure;

C
Cong Wang 已提交
4353 4354
	if (!vxlan_addr_any(&dst->remote_ip)) {
		if (dst->remote_ip.sa.sa_family == AF_INET) {
4355 4356
			if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP,
					    dst->remote_ip.sin.sin_addr.s_addr))
C
Cong Wang 已提交
4357 4358 4359
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
4360 4361
			if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6,
					     &dst->remote_ip.sin6.sin6_addr))
C
Cong Wang 已提交
4362 4363 4364 4365
				goto nla_put_failure;
#endif
		}
	}
S
stephen hemminger 已提交
4366

4367
	if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
S
stephen hemminger 已提交
4368 4369
		goto nla_put_failure;

4370 4371
	if (!vxlan_addr_any(&vxlan->cfg.saddr)) {
		if (vxlan->cfg.saddr.sa.sa_family == AF_INET) {
4372
			if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
4373
					    vxlan->cfg.saddr.sin.sin_addr.s_addr))
C
Cong Wang 已提交
4374 4375 4376
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
4377
			if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
4378
					     &vxlan->cfg.saddr.sin6.sin6_addr))
C
Cong Wang 已提交
4379 4380 4381 4382
				goto nla_put_failure;
#endif
		}
	}
S
stephen hemminger 已提交
4383

4384
	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
H
Hangbin Liu 已提交
4385 4386
	    nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT,
		       !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) ||
4387
	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
4388
	    nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
4389
	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
D
David Stevens 已提交
4390
	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
4391
		       !!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
D
David Stevens 已提交
4392
	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
4393
		       !!(vxlan->cfg.flags & VXLAN_F_PROXY)) ||
4394 4395
	    nla_put_u8(skb, IFLA_VXLAN_RSC,
		       !!(vxlan->cfg.flags & VXLAN_F_RSC)) ||
D
David Stevens 已提交
4396
	    nla_put_u8(skb, IFLA_VXLAN_L2MISS,
4397
		       !!(vxlan->cfg.flags & VXLAN_F_L2MISS)) ||
D
David Stevens 已提交
4398
	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
4399
		       !!(vxlan->cfg.flags & VXLAN_F_L3MISS)) ||
4400
	    nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA,
4401
		       !!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) ||
4402 4403 4404
	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
4405
	    nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
4406
		       !(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
4407
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
4408
		       !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
4409
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
4410
		       !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
T
Tom Herbert 已提交
4411
	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
4412
		       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) ||
T
Tom Herbert 已提交
4413
	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
4414
		       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)))
S
stephen hemminger 已提交
4415 4416
		goto nla_put_failure;

4417 4418 4419
	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
		goto nla_put_failure;

4420
	if (vxlan->cfg.flags & VXLAN_F_GBP &&
T
Thomas Graf 已提交
4421 4422 4423
	    nla_put_flag(skb, IFLA_VXLAN_GBP))
		goto nla_put_failure;

4424
	if (vxlan->cfg.flags & VXLAN_F_GPE &&
J
Jiri Benc 已提交
4425 4426 4427
	    nla_put_flag(skb, IFLA_VXLAN_GPE))
		goto nla_put_failure;

4428
	if (vxlan->cfg.flags & VXLAN_F_REMCSUM_NOPARTIAL &&
4429 4430 4431
	    nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
		goto nla_put_failure;

S
stephen hemminger 已提交
4432 4433 4434 4435 4436 4437
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

4438 4439 4440 4441 4442 4443 4444
static struct net *vxlan_get_link_net(const struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

	return vxlan->net;
}

S
stephen hemminger 已提交
4445 4446 4447 4448 4449 4450 4451 4452
/* rtnetlink link operations for devices of kind "vxlan". */
static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
	.kind		= "vxlan",
	.priv_size	= sizeof(struct vxlan_dev),
	.setup		= vxlan_setup,

	/* attribute parsing */
	.maxtype	= IFLA_VXLAN_MAX,
	.policy		= vxlan_policy,
	.validate	= vxlan_validate,

	/* device life cycle */
	.newlink	= vxlan_newlink,
	.changelink	= vxlan_changelink,
	.dellink	= vxlan_dellink,

	/* dumping */
	.get_size	= vxlan_get_size,
	.fill_info	= vxlan_fill_info,
	.get_link_net	= vxlan_get_link_net,
};

4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470
/* Create and configure a VXLAN device from a ready-made configuration,
 * without a netlink request.
 *
 * @net:              namespace to create the device in
 * @name:             device name
 * @name_assign_type: passed through to rtnl_create_link()
 * @conf:             configuration handed to __vxlan_dev_create()
 *
 * Returns the new device, or an ERR_PTR() carrying the error of the step
 * that failed.
 */
struct net_device *vxlan_dev_create(struct net *net, const char *name,
				    u8 name_assign_type,
				    struct vxlan_config *conf)
{
	/* No link attributes are supplied: pass an all-NULL table. */
	struct nlattr *tb[IFLA_MAX + 1] = {};
	struct net_device *dev;
	int rc;

	dev = rtnl_create_link(net, name, name_assign_type,
			       &vxlan_link_ops, tb, NULL);
	if (IS_ERR(dev))
		return dev;

	rc = __vxlan_dev_create(net, dev, conf, NULL);
	if (rc < 0) {
		free_netdev(dev);
		return ERR_PTR(rc);
	}

	rc = rtnl_configure_link(dev, NULL);
	if (rc < 0) {
		/* The device exists by now, so tear it down via dellink. */
		LIST_HEAD(kill_list);

		vxlan_dellink(dev, &kill_list);
		unregister_netdevice_many(&kill_list);
		return ERR_PTR(rc);
	}

	return dev;
}
EXPORT_SYMBOL_GPL(vxlan_dev_create);

4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515
static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
					     struct net_device *dev)
{
	struct vxlan_dev *vxlan, *next;
	LIST_HEAD(list_kill);

	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
		struct vxlan_rdst *dst = &vxlan->default_dst;

		/* In case we created vxlan device with carrier
		 * and we loose the carrier due to module unload
		 * we also need to remove vxlan device. In other
		 * cases, it's not necessary and remote_ifindex
		 * is 0 here, so no matches.
		 */
		if (dst->remote_ifindex == dev->ifindex)
			vxlan_dellink(vxlan->dev, &list_kill);
	}

	unregister_netdevice_many(&list_kill);
}

4516 4517
static int vxlan_netdevice_event(struct notifier_block *unused,
				 unsigned long event, void *ptr)
4518 4519
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4520
	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
4521

4522
	if (event == NETDEV_UNREGISTER) {
4523 4524
		if (!dev->udp_tunnel_nic_info)
			vxlan_offload_rx_ports(dev, false);
4525
		vxlan_handle_lowerdev_unregister(vn, dev);
4526
	} else if (event == NETDEV_REGISTER) {
4527 4528
		if (!dev->udp_tunnel_nic_info)
			vxlan_offload_rx_ports(dev, true);
4529 4530
	} else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO ||
		   event == NETDEV_UDP_TUNNEL_DROP_INFO) {
4531
		vxlan_offload_rx_ports(dev, event == NETDEV_UDP_TUNNEL_PUSH_INFO);
4532
	}
4533 4534 4535 4536 4537

	return NOTIFY_DONE;
}

/* Notifier block that routes netdevice events to vxlan_netdevice_event(). */
static struct notifier_block vxlan_notifier_block __read_mostly = {
	.notifier_call = vxlan_netdevice_event,
};

4541 4542 4543 4544 4545 4546 4547
static void
vxlan_fdb_offloaded_set(struct net_device *dev,
			struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
4548 4549 4550
	u32 hash_index;

	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
4551

4552
	spin_lock_bh(&vxlan->hash_lock[hash_index]);
4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567

	f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
	if (!f)
		goto out;

	rdst = vxlan_fdb_find_rdst(f, &fdb_info->remote_ip,
				   fdb_info->remote_port,
				   fdb_info->remote_vni,
				   fdb_info->remote_ifindex);
	if (!rdst)
		goto out;

	rdst->offloaded = fdb_info->offloaded;

out:
4568
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
4569 4570
}

P
Petr Machata 已提交
4571 4572 4573 4574 4575
static int
vxlan_fdb_external_learn_add(struct net_device *dev,
			     struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
4576
	struct netlink_ext_ack *extack;
4577
	u32 hash_index;
P
Petr Machata 已提交
4578 4579
	int err;

4580
	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
4581 4582
	extack = switchdev_notifier_info_to_extack(&fdb_info->info);

4583
	spin_lock_bh(&vxlan->hash_lock[hash_index]);
P
Petr Machata 已提交
4584 4585 4586 4587 4588 4589 4590 4591
	err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip,
			       NUD_REACHABLE,
			       NLM_F_CREATE | NLM_F_REPLACE,
			       fdb_info->remote_port,
			       fdb_info->vni,
			       fdb_info->remote_vni,
			       fdb_info->remote_ifindex,
			       NTF_USE | NTF_SELF | NTF_EXT_LEARNED,
4592
			       0, false, extack);
4593
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
P
Petr Machata 已提交
4594 4595 4596 4597 4598 4599 4600 4601 4602 4603

	return err;
}

static int
vxlan_fdb_external_learn_del(struct net_device *dev,
			     struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
4604
	u32 hash_index;
P
Petr Machata 已提交
4605 4606
	int err = 0;

4607 4608
	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
	spin_lock_bh(&vxlan->hash_lock[hash_index]);
P
Petr Machata 已提交
4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621

	f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
	if (!f)
		err = -ENOENT;
	else if (f->flags & NTF_EXT_LEARNED)
		err = __vxlan_fdb_delete(vxlan, fdb_info->eth_addr,
					 fdb_info->remote_ip,
					 fdb_info->remote_port,
					 fdb_info->vni,
					 fdb_info->remote_vni,
					 fdb_info->remote_ifindex,
					 false);

4622
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
P
Petr Machata 已提交
4623 4624 4625 4626

	return err;
}

4627 4628 4629 4630
static int vxlan_switchdev_event(struct notifier_block *unused,
				 unsigned long event, void *ptr)
{
	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
P
Petr Machata 已提交
4631 4632
	struct switchdev_notifier_vxlan_fdb_info *fdb_info;
	int err = 0;
4633 4634 4635 4636 4637

	switch (event) {
	case SWITCHDEV_VXLAN_FDB_OFFLOADED:
		vxlan_fdb_offloaded_set(dev, ptr);
		break;
P
Petr Machata 已提交
4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657
	case SWITCHDEV_VXLAN_FDB_ADD_TO_BRIDGE:
		fdb_info = ptr;
		err = vxlan_fdb_external_learn_add(dev, fdb_info);
		if (err) {
			err = notifier_from_errno(err);
			break;
		}
		fdb_info->offloaded = true;
		vxlan_fdb_offloaded_set(dev, fdb_info);
		break;
	case SWITCHDEV_VXLAN_FDB_DEL_TO_BRIDGE:
		fdb_info = ptr;
		err = vxlan_fdb_external_learn_del(dev, fdb_info);
		if (err) {
			err = notifier_from_errno(err);
			break;
		}
		fdb_info->offloaded = false;
		vxlan_fdb_offloaded_set(dev, fdb_info);
		break;
4658 4659
	}

P
Petr Machata 已提交
4660
	return err;
4661 4662 4663 4664 4665 4666
}

/* Notifier block that routes switchdev events to vxlan_switchdev_event(). */
static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = {
	.notifier_call = vxlan_switchdev_event,
};

4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686
/* Destroy every FDB entry linked on @nh's fdb_list.
 *
 * The list is walked under rcu_read_lock(); each entry is destroyed under
 * the hash lock of its owning VXLAN device, and only if it is still hashed.
 */
static void vxlan_fdb_nh_flush(struct nexthop *nh)
{
	struct vxlan_fdb *fdb;
	struct vxlan_dev *vxlan;
	u32 hash_index;

	rcu_read_lock();
	list_for_each_entry_rcu(fdb, &nh->fdb_list, nh_list) {
		vxlan = rcu_dereference(fdb->vdev);
		/* Without an owning device there is neither a bucket nor a
		 * lock to use: warn and skip the entry instead of
		 * dereferencing a NULL pointer below.
		 */
		if (WARN_ON(!vxlan))
			continue;
		hash_index = fdb_head_index(vxlan, fdb->eth_addr,
					    vxlan->default_dst.remote_vni);
		spin_lock_bh(&vxlan->hash_lock[hash_index]);
		if (!hlist_unhashed(&fdb->hlist))
			vxlan_fdb_destroy(vxlan, fdb, false, false);
		spin_unlock_bh(&vxlan->hash_lock[hash_index]);
	}
	rcu_read_unlock();
}

4687 4688 4689 4690 4691 4692 4693 4694
/* Nexthop notifier callback: when a nexthop is deleted, flush the FDB
 * entries that use it. All other events, and a NULL nexthop, are ignored.
 * Always returns NOTIFY_DONE.
 */
static int vxlan_nexthop_event(struct notifier_block *nb,
			       unsigned long event, void *ptr)
{
	struct nexthop *nh = ptr;

	if (nh && event == NEXTHOP_EVENT_DEL)
		vxlan_fdb_nh_flush(nh);

	return NOTIFY_DONE;
}

/* Notifier block that routes nexthop events to vxlan_nexthop_event(). */
static struct notifier_block vxlan_nexthop_notifier_block __read_mostly = {
	.notifier_call = vxlan_nexthop_event,
};

S
stephen hemminger 已提交
4704 4705 4706
static __net_init int vxlan_init_net(struct net *net)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
4707
	unsigned int h;
S
stephen hemminger 已提交
4708

4709
	INIT_LIST_HEAD(&vn->vxlan_list);
4710
	spin_lock_init(&vn->sock_lock);
S
stephen hemminger 已提交
4711

4712 4713
	for (h = 0; h < PORT_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vn->sock_list[h]);
S
stephen hemminger 已提交
4714

4715
	return register_nexthop_notifier(net, &vxlan_nexthop_notifier_block);
S
stephen hemminger 已提交
4716 4717
}

4718
static void vxlan_destroy_tunnels(struct net *net, struct list_head *head)
N
Nicolas Dichtel 已提交
4719 4720 4721 4722
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan, *next;
	struct net_device *dev, *aux;
4723
	unsigned int h;
N
Nicolas Dichtel 已提交
4724 4725 4726

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == &vxlan_link_ops)
4727
			unregister_netdevice_queue(dev, head);
N
Nicolas Dichtel 已提交
4728 4729 4730 4731 4732

	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
		/* If vxlan->dev is in the same netns, it has already been added
		 * to the list by the previous loop.
		 */
4733
		if (!net_eq(dev_net(vxlan->dev), net))
4734
			unregister_netdevice_queue(vxlan->dev, head);
N
Nicolas Dichtel 已提交
4735 4736
	}

4737 4738
	for (h = 0; h < PORT_HASH_SIZE; ++h)
		WARN_ON_ONCE(!hlist_empty(&vn->sock_list[h]));
N
Nicolas Dichtel 已提交
4739 4740
}

4741 4742 4743 4744 4745 4746
/* Batched per-netns exit. Under RTNL: unregister the nexthop notifier of
 * every exiting netns, collect all their VXLAN devices on one list and
 * unregister them with a single unregister_netdevice_many() call.
 */
static void __net_exit vxlan_exit_batch_net(struct list_head *net_list)
{
	LIST_HEAD(kill_list);
	struct net *net;

	rtnl_lock();

	list_for_each_entry(net, net_list, exit_list) {
		unregister_nexthop_notifier(net,
					    &vxlan_nexthop_notifier_block);
	}

	list_for_each_entry(net, net_list, exit_list) {
		vxlan_destroy_tunnels(net, &kill_list);
	}

	unregister_netdevice_many(&kill_list);

	rtnl_unlock();
}

S
stephen hemminger 已提交
4756 4757
static struct pernet_operations vxlan_net_ops = {
	.init = vxlan_init_net,
4758
	.exit_batch = vxlan_exit_batch_net,
S
stephen hemminger 已提交
4759 4760 4761 4762 4763 4764 4765 4766 4767 4768
	.id   = &vxlan_net_id,
	.size = sizeof(struct vxlan_net),
};

/* Module init: seed vxlan_salt, then register the pernet subsystem, the
 * netdevice notifier, the switchdev notifier and the rtnl link ops, in
 * that order. When a step fails, the steps already done are undone in
 * reverse order and the error is returned.
 */
static int __init vxlan_init_module(void)
{
	int err;

	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));

	err = register_pernet_subsys(&vxlan_net_ops);
	if (err)
		return err;

	err = register_netdevice_notifier(&vxlan_notifier_block);
	if (err)
		goto err_pernet;

	err = register_switchdev_notifier(&vxlan_switchdev_notifier_block);
	if (err)
		goto err_netdev_notifier;

	err = rtnl_link_register(&vxlan_link_ops);
	if (err)
		goto err_switchdev_notifier;

	return 0;

err_switchdev_notifier:
	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
err_netdev_notifier:
	unregister_netdevice_notifier(&vxlan_notifier_block);
err_pernet:
	unregister_pernet_subsys(&vxlan_net_ops);
	return err;
}
late_initcall(vxlan_init_module);
S
stephen hemminger 已提交
4796 4797 4798

/* Module exit: undo vxlan_init_module() in reverse registration order. */
static void __exit vxlan_cleanup_module(void)
{
	/* registered last, removed first */
	rtnl_link_unregister(&vxlan_link_ops);

	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
	unregister_netdevice_notifier(&vxlan_notifier_block);

	unregister_pernet_subsys(&vxlan_net_ops);
	/* rcu_barrier() is called by netns */
}
module_exit(vxlan_cleanup_module);

/* Module metadata. */
MODULE_LICENSE("GPL");
MODULE_VERSION(VXLAN_VERSION);
MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>");
MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic");
MODULE_ALIAS_RTNL_LINK("vxlan");