vxlan.c 121.3 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0-only
S
stephen hemminger 已提交
2
/*
 * VXLAN: Virtual eXtensible Local Area Network
 *
 * Copyright (c) 2012-2013 Vyatta Inc.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/udp.h>
#include <linux/igmp.h>
#include <linux/if_ether.h>
Y
Yan Burman 已提交
17
#include <linux/ethtool.h>
D
David Stevens 已提交
18 19
#include <net/arp.h>
#include <net/ndisc.h>
20
#include <net/ipv6_stubs.h>
S
stephen hemminger 已提交
21 22 23 24 25 26
#include <net/ip.h>
#include <net/icmp.h>
#include <net/rtnetlink.h>
#include <net/inet_ecn.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
27
#include <net/tun_proto.h>
28
#include <net/vxlan.h>
29
#include <net/nexthop.h>
30

C
Cong Wang 已提交
31 32
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ip6_tunnel.h>
33
#include <net/ip6_checksum.h>
C
Cong Wang 已提交
34
#endif
S
stephen hemminger 已提交
35 36 37

#define VXLAN_VERSION	"0.1"

38 39
#define PORT_HASH_BITS	8
#define PORT_HASH_SIZE  (1<<PORT_HASH_BITS)
S
stephen hemminger 已提交
40 41 42
#define FDB_AGE_DEFAULT 300 /* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */

43 44
/* UDP port for VXLAN traffic.
 * The IANA assigned port is 4789, but the Linux default is 8472
 * for compatibility with early adopters.
 */
47 48
static unsigned short vxlan_port __read_mostly = 8472;
module_param_named(udp_port, vxlan_port, ushort, 0444);
S
stephen hemminger 已提交
49 50 51 52 53 54
MODULE_PARM_DESC(udp_port, "Destination UDP port");

static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");

55
static unsigned int vxlan_net_id;
56
static struct rtnl_link_ops vxlan_link_ops;
57

58
static const u8 all_zeros_mac[ETH_ALEN + 2];
59

60
static int vxlan_sock_add(struct vxlan_dev *vxlan);
61

62 63
static void vxlan_vs_del_dev(struct vxlan_dev *vxlan);

64 65 66 67
/* per-network namespace private data for this module */
struct vxlan_net {
	struct list_head  vxlan_list;
	struct hlist_head sock_list[PORT_HASH_SIZE];
68
	spinlock_t	  sock_lock;
69
	struct notifier_block nexthop_notifier_block;
70 71
};

S
stephen hemminger 已提交
72 73 74 75 76 77
/* Forwarding table entry */
struct vxlan_fdb {
	struct hlist_node hlist;	/* linked list of entries */
	struct rcu_head	  rcu;
	unsigned long	  updated;	/* jiffies */
	unsigned long	  used;
78
	struct list_head  remotes;
79
	u8		  eth_addr[ETH_ALEN];
S
stephen hemminger 已提交
80
	u16		  state;	/* see ndm_state */
81
	__be32		  vni;
P
Petr Machata 已提交
82
	u16		  flags;	/* see ndm_flags and below */
83 84
	struct list_head  nh_list;
	struct nexthop __rcu *nh;
85
	struct vxlan_dev  __rcu *vdev;
S
stephen hemminger 已提交
86 87
};

P
Petr Machata 已提交
88 89
#define NTF_VXLAN_ADDED_BY_USER 0x100

S
stephen hemminger 已提交
90 91 92
/* salt for hash table */
static u32 vxlan_salt __read_mostly;

T
Thomas Graf 已提交
93 94
static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
{
95 96
	return vs->flags & VXLAN_F_COLLECT_METADATA ||
	       ip_tunnel_collect_metadata();
T
Thomas Graf 已提交
97 98
}

C
Cong Wang 已提交
99 100 101 102
#if IS_ENABLED(CONFIG_IPV6)
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
J
Jiri Benc 已提交
103 104 105 106 107 108
	if (a->sa.sa_family != b->sa.sa_family)
		return false;
	if (a->sa.sa_family == AF_INET6)
		return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
	else
		return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
C
Cong Wang 已提交
109 110 111 112
}

static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
J
Jiri Benc 已提交
113
	if (nla_len(nla) >= sizeof(struct in6_addr)) {
114
		ip->sin6.sin6_addr = nla_get_in6_addr(nla);
J
Jiri Benc 已提交
115 116 117
		ip->sa.sa_family = AF_INET6;
		return 0;
	} else if (nla_len(nla) >= sizeof(__be32)) {
118
		ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
J
Jiri Benc 已提交
119 120 121 122 123
		ip->sa.sa_family = AF_INET;
		return 0;
	} else {
		return -EAFNOSUPPORT;
	}
C
Cong Wang 已提交
124 125 126
}

static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
J
Jiri Benc 已提交
127
			      const union vxlan_addr *ip)
C
Cong Wang 已提交
128
{
J
Jiri Benc 已提交
129
	if (ip->sa.sa_family == AF_INET6)
130
		return nla_put_in6_addr(skb, attr, &ip->sin6.sin6_addr);
J
Jiri Benc 已提交
131
	else
132
		return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
C
Cong Wang 已提交
133 134 135 136 137 138 139
}

#else /* !CONFIG_IPV6 */

static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
J
Jiri Benc 已提交
140
	return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
C
Cong Wang 已提交
141 142 143 144
}

static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
J
Jiri Benc 已提交
145 146 147
	if (nla_len(nla) >= sizeof(struct in6_addr)) {
		return -EAFNOSUPPORT;
	} else if (nla_len(nla) >= sizeof(__be32)) {
148
		ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
J
Jiri Benc 已提交
149 150 151 152 153
		ip->sa.sa_family = AF_INET;
		return 0;
	} else {
		return -EAFNOSUPPORT;
	}
C
Cong Wang 已提交
154 155 156
}

static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
J
Jiri Benc 已提交
157
			      const union vxlan_addr *ip)
C
Cong Wang 已提交
158
{
159
	return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
C
Cong Wang 已提交
160 161 162
}
#endif

163
/* Virtual Network hash table head */
164
static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni)
165
{
166
	return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)];
167 168 169 170
}

/* Bucket of the per-namespace socket hash table for UDP port @port */
static inline struct hlist_head *vs_head(struct net *net, __be16 port)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	const u32 bucket = hash_32(ntohs(port), PORT_HASH_BITS);

	return &vn->sock_list[bucket];
}

177 178 179
/* First remote destination for a forwarding entry.
 * Guaranteed to be non-NULL because remotes are never deleted.
 */
180
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
181
{
182 183
	if (rcu_access_pointer(fdb->nh))
		return NULL;
184 185 186 187 188
	return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
}

static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
{
189 190
	if (rcu_access_pointer(fdb->nh))
		return NULL;
191
	return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
192 193
}

194 195 196
/* Find VXLAN socket based on network namespace, address family, UDP port,
 * enabled unshareable flags and socket device binding (see l3mdev with
 * non-default VRF).
197 198
 */
static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
199
					  __be16 port, u32 flags, int ifindex)
200 201
{
	struct vxlan_sock *vs;
202 203

	flags &= VXLAN_F_RCV_FLAGS;
204 205

	hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
206
		if (inet_sk(vs->sock->sk)->inet_sport == port &&
207
		    vxlan_get_sk_family(vs) == family &&
208 209
		    vs->flags == flags &&
		    vs->sock->sk->sk_bound_dev_if == ifindex)
210 211 212
			return vs;
	}
	return NULL;
S
stephen hemminger 已提交
213 214
}

215 216
/* Find the device on socket @vs whose default remote VNI equals @vni.
 * With IPv6 enabled, devices flagged VXLAN_F_IPV6_LINKLOCAL must also
 * match @ifindex. Returns NULL when nothing matches.
 */
static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex,
					   __be32 vni)
{
	struct vxlan_dev_node *node;

	/* For flow based devices, map all packets to VNI 0 */
	if (vs->flags & VXLAN_F_COLLECT_METADATA)
		vni = 0;

	hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) {
		struct vxlan_dev *candidate = node->vxlan;

		if (candidate->default_dst.remote_vni != vni)
			continue;

		if (IS_ENABLED(CONFIG_IPV6) &&
		    (candidate->cfg.flags & VXLAN_F_IPV6_LINKLOCAL) &&
		    candidate->cfg.remote_ifindex != ifindex)
			continue;

		return candidate;
	}

	return NULL;
}

P
Pravin B Shelar 已提交
242
/* Look up VNI in a per net namespace table */
243 244 245
static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex,
					__be32 vni, sa_family_t family,
					__be16 port, u32 flags)
P
Pravin B Shelar 已提交
246 247 248
{
	struct vxlan_sock *vs;

249
	vs = vxlan_find_sock(net, family, port, flags, ifindex);
P
Pravin B Shelar 已提交
250 251 252
	if (!vs)
		return NULL;

253
	return vxlan_vs_find_vni(vs, ifindex, vni);
P
Pravin B Shelar 已提交
254 255
}

S
stephen hemminger 已提交
256 257
/* Fill in neighbour message in skbuff.
 *
 * Builds one ndmsg of @type describing @fdb (and @rdst, when given).
 * Nexthop-backed entries emit NDA_NH_ID instead of the per-remote
 * attributes. Returns 0 on success; on any failure the partial message
 * is cancelled and -EMSGSIZE is returned.
 */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
			  const struct vxlan_fdb *fdb,
			  u32 portid, u32 seq, int type, unsigned int flags,
			  const struct vxlan_rdst *rdst)
{
	unsigned long now = jiffies;
	bool send_eth = true;
	bool send_ip = true;
	struct nda_cacheinfo ci;
	struct nlmsghdr *nlh;
	struct nexthop *nh;
	struct ndmsg *ndm;
	int nh_family;
	u32 nh_id;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
	if (!nlh)
		return -EMSGSIZE;

	ndm = nlmsg_data(nlh);
	memset(ndm, 0, sizeof(*ndm));

	/* Copy what we need from the nexthop under RCU; after the unlock
	 * 'nh' is only tested for NULL, never dereferenced.
	 */
	rcu_read_lock();
	nh = rcu_dereference(fdb->nh);
	if (nh) {
		nh_family = nexthop_get_family(nh);
		nh_id = nh->id;
	}
	rcu_read_unlock();

	if (type != RTM_GETNEIGH) {
		ndm->ndm_family = AF_BRIDGE;
	} else {
		if (rdst) {
			send_ip = !vxlan_addr_any(&rdst->remote_ip);
			ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET;
		} else if (nh) {
			ndm->ndm_family = nh_family;
		}
		send_eth = !is_zero_ether_addr(fdb->eth_addr);
	}

	ndm->ndm_state = fdb->state;
	ndm->ndm_ifindex = vxlan->dev->ifindex;
	ndm->ndm_flags = fdb->flags;
	if (rdst && rdst->offloaded)
		ndm->ndm_flags |= NTF_OFFLOADED;
	ndm->ndm_type = RTN_UNICAST;

	/* Link netns id is only reported when it differs from the device's */
	if (!net_eq(dev_net(vxlan->dev), vxlan->net) &&
	    nla_put_s32(skb, NDA_LINK_NETNSID,
			peernet2id(dev_net(vxlan->dev), vxlan->net)))
		goto cancel;

	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
		goto cancel;

	if (nh) {
		if (nla_put_u32(skb, NDA_NH_ID, nh_id))
			goto cancel;
	} else if (rdst) {
		if (send_ip &&
		    vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip))
			goto cancel;

		/* Port and VNI are only sent when they differ from the
		 * device defaults.
		 */
		if (rdst->remote_port &&
		    rdst->remote_port != vxlan->cfg.dst_port &&
		    nla_put_be16(skb, NDA_PORT, rdst->remote_port))
			goto cancel;

		if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
		    nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni)))
			goto cancel;

		if (rdst->remote_ifindex &&
		    nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
			goto cancel;
	}

	if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni &&
	    nla_put_u32(skb, NDA_SRC_VNI, be32_to_cpu(fdb->vni)))
		goto cancel;

	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
	ci.ndm_confirmed = 0;
	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
	ci.ndm_refcnt	 = 0;

	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
		goto cancel;

	nlmsg_end(skb, nlh);
	return 0;

cancel:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static inline size_t vxlan_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct ndmsg))
		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
C
Cong Wang 已提交
357
		+ nla_total_size(sizeof(struct in6_addr)) /* NDA_DST */
358
		+ nla_total_size(sizeof(__be16)) /* NDA_PORT */
359 360
		+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
		+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
361
		+ nla_total_size(sizeof(__s32)) /* NDA_LINK_NETNSID */
S
stephen hemminger 已提交
362 363 364
		+ nla_total_size(sizeof(struct nda_cacheinfo));
}

P
Petr Machata 已提交
365 366
static void __vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			       struct vxlan_rdst *rd, int type)
S
stephen hemminger 已提交
367 368 369 370 371 372 373 374 375
{
	struct net *net = dev_net(vxlan->dev);
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
	if (skb == NULL)
		goto errout;

376
	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, rd);
S
stephen hemminger 已提交
377 378 379 380 381 382 383 384 385 386 387 388 389 390
	if (err < 0) {
		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}

391 392 393
static void vxlan_fdb_switchdev_notifier_info(const struct vxlan_dev *vxlan,
			    const struct vxlan_fdb *fdb,
			    const struct vxlan_rdst *rd,
394
			    struct netlink_ext_ack *extack,
395 396 397
			    struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	fdb_info->info.dev = vxlan->dev;
398
	fdb_info->info.extack = extack;
399 400 401 402 403 404 405 406 407 408
	fdb_info->remote_ip = rd->remote_ip;
	fdb_info->remote_port = rd->remote_port;
	fdb_info->remote_vni = rd->remote_vni;
	fdb_info->remote_ifindex = rd->remote_ifindex;
	memcpy(fdb_info->eth_addr, fdb->eth_addr, ETH_ALEN);
	fdb_info->vni = fdb->vni;
	fdb_info->offloaded = rd->offloaded;
	fdb_info->added_by_user = fdb->flags & NTF_VXLAN_ADDED_BY_USER;
}

409 410 411
static int vxlan_fdb_switchdev_call_notifiers(struct vxlan_dev *vxlan,
					      struct vxlan_fdb *fdb,
					      struct vxlan_rdst *rd,
412 413
					      bool adding,
					      struct netlink_ext_ack *extack)
P
Petr Machata 已提交
414 415 416
{
	struct switchdev_notifier_vxlan_fdb_info info;
	enum switchdev_notifier_type notifier_type;
417
	int ret;
P
Petr Machata 已提交
418 419

	if (WARN_ON(!rd))
420
		return 0;
P
Petr Machata 已提交
421 422 423

	notifier_type = adding ? SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE
			       : SWITCHDEV_VXLAN_FDB_DEL_TO_DEVICE;
424
	vxlan_fdb_switchdev_notifier_info(vxlan, fdb, rd, NULL, &info);
425
	ret = call_switchdev_notifiers(notifier_type, vxlan->dev,
426
				       &info.info, extack);
427
	return notifier_to_errno(ret);
P
Petr Machata 已提交
428 429
}

430
/* Notify about an FDB change: switchdev first (only when @swdev_notify is
 * set and @rd is given), then netlink.
 *
 * A switchdev failure on RTM_NEWNEIGH is returned and suppresses the
 * netlink message; a failure on RTM_DELNEIGH is ignored. Returns 0
 * otherwise.
 */
static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			    struct vxlan_rdst *rd, int type, bool swdev_notify,
			    struct netlink_ext_ack *extack)
{
	if (swdev_notify && rd) {
		if (type == RTM_NEWNEIGH) {
			int err;

			err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
								 true, extack);
			if (err)
				return err;
		} else if (type == RTM_DELNEIGH) {
			vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd,
							   false, extack);
		}
	}

	__vxlan_fdb_notify(vxlan, fdb, rd, type);
	return 0;
}

C
Cong Wang 已提交
455
static void vxlan_ip_miss(struct net_device *dev, union vxlan_addr *ipa)
D
David Stevens 已提交
456 457
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
458 459 460 461
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
	struct vxlan_rdst remote = {
C
Cong Wang 已提交
462
		.remote_ip = *ipa, /* goes to NDA_DST */
463
		.remote_vni = cpu_to_be32(VXLAN_N_VID),
464
	};
465

466
	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL);
D
David Stevens 已提交
467 468 469 470
}

static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
{
471 472 473
	struct vxlan_fdb f = {
		.state = NUD_STALE,
	};
474
	struct vxlan_rdst remote = { };
D
David Stevens 已提交
475 476 477

	memcpy(f.eth_addr, eth_addr, ETH_ALEN);

478
	vxlan_fdb_notify(vxlan, &f, &remote, RTM_GETNEIGH, true, NULL);
D
David Stevens 已提交
479 480
}

S
stephen hemminger 已提交
481 482 483 484 485 486 487 488
/* Hash Ethernet address */
static u32 eth_hash(const unsigned char *addr)
{
	u64 value = get_unaligned((u64 *)addr);

	/* only want 6 bytes */
#ifdef __BIG_ENDIAN
	value >>= 16;
489 490
#else
	value <<= 16;
S
stephen hemminger 已提交
491 492 493 494
#endif
	return hash_64(value, FDB_HASH_BITS);
}

495 496 497 498 499 500 501 502
/* Hash the last four MAC bytes together with @vni, salted with
 * vxlan_salt, and reduce the result to an FDB bucket index.
 */
static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
{
	/* use 1 byte of OUI and 3 bytes of NIC */
	const u32 mac_tail = get_unaligned((u32 *)(addr + 2));
	const u32 hashed = jhash_2words(mac_tail, vni, vxlan_salt);

	return hashed & (FDB_HASH_SIZE - 1);
}

503 504 505 506 507 508 509 510
/* FDB bucket index for (@mac, @vni): the VNI takes part in the hash only
 * for devices flagged VXLAN_F_COLLECT_METADATA.
 */
static u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni)
{
	return (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) ?
		eth_vni_hash(mac, vni) : eth_hash(mac);
}

S
stephen hemminger 已提交
511 512
/* Hash chain to use given mac address */
static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
513
						const u8 *mac, __be32 vni)
S
stephen hemminger 已提交
514
{
515
	return &vxlan->fdb_head[fdb_head_index(vxlan, mac, vni)];
S
stephen hemminger 已提交
516 517 518
}

/* Look up Ethernet address in forwarding table */
519
static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
520
					  const u8 *mac, __be32 vni)
S
stephen hemminger 已提交
521
{
522
	struct hlist_head *head = vxlan_fdb_head(vxlan, mac, vni);
S
stephen hemminger 已提交
523 524
	struct vxlan_fdb *f;

525
	hlist_for_each_entry_rcu(f, head, hlist) {
526
		if (ether_addr_equal(mac, f->eth_addr)) {
527
			if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
528 529 530 531 532 533
				if (vni == f->vni)
					return f;
			} else {
				return f;
			}
		}
S
stephen hemminger 已提交
534 535 536 537 538
	}

	return NULL;
}

539
static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
540
					const u8 *mac, __be32 vni)
541 542 543
{
	struct vxlan_fdb *f;

544
	f = __vxlan_find_mac(vxlan, mac, vni);
545
	if (f && f->used != jiffies)
546 547 548 549 550
		f->used = jiffies;

	return f;
}

551 552
/* caller should hold vxlan->hash_lock */
static struct vxlan_rdst *vxlan_fdb_find_rdst(struct vxlan_fdb *f,
C
Cong Wang 已提交
553
					      union vxlan_addr *ip, __be16 port,
554
					      __be32 vni, __u32 ifindex)
555
{
556
	struct vxlan_rdst *rd;
557

558
	list_for_each_entry(rd, &f->remotes, list) {
C
Cong Wang 已提交
559
		if (vxlan_addr_equal(&rd->remote_ip, ip) &&
560 561 562
		    rd->remote_port == port &&
		    rd->remote_vni == vni &&
		    rd->remote_ifindex == ifindex)
563
			return rd;
564
	}
565

566 567 568
	return NULL;
}

569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592
/* Look up the unicast FDB entry for (@mac, @vni) on @dev and describe its
 * first remote in @fdb_info.
 *
 * Returns 0 on success, -EINVAL for a multicast or all-zero @mac, and
 * -ENOENT when there is no such entry or the entry has no remote
 * destination (it is backed by a nexthop object instead).
 */
int vxlan_fdb_find_uc(struct net_device *dev, const u8 *mac, __be32 vni,
		      struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	u8 eth_addr[ETH_ALEN + 2] = { 0 };
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	int rc = 0;

	if (is_multicast_ether_addr(mac) ||
	    is_zero_ether_addr(mac))
		return -EINVAL;

	/* Padded copy: eth_hash() loads 8 bytes from the address buffer */
	ether_addr_copy(eth_addr, mac);

	rcu_read_lock();

	f = __vxlan_find_mac(vxlan, eth_addr, vni);
	if (!f) {
		rc = -ENOENT;
		goto out;
	}

	/* first_remote_rcu() returns NULL for nexthop-backed entries; the
	 * notifier-info helper dereferences the remote, so bail out here.
	 */
	rdst = first_remote_rcu(f);
	if (!rdst) {
		rc = -ENOENT;
		goto out;
	}

	vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, NULL, fdb_info);

out:
	rcu_read_unlock();
	return rc;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_find_uc);

P
Petr Machata 已提交
601 602 603
static int vxlan_fdb_notify_one(struct notifier_block *nb,
				const struct vxlan_dev *vxlan,
				const struct vxlan_fdb *f,
604 605
				const struct vxlan_rdst *rdst,
				struct netlink_ext_ack *extack)
P
Petr Machata 已提交
606 607 608 609
{
	struct switchdev_notifier_vxlan_fdb_info fdb_info;
	int rc;

610
	vxlan_fdb_switchdev_notifier_info(vxlan, f, rdst, extack, &fdb_info);
P
Petr Machata 已提交
611 612 613 614 615 616
	rc = nb->notifier_call(nb, SWITCHDEV_VXLAN_FDB_ADD_TO_DEVICE,
			       &fdb_info);
	return notifier_to_errno(rc);
}

/* Replay every remote of every FDB entry with VNI @vni on @dev to
 * notifier @nb as an "add to device" event.
 *
 * Each hash bucket is walked under its own hash_lock. The walk stops at
 * the first notifier error, which is returned; -EINVAL if @dev is not a
 * VXLAN device; 0 otherwise.
 */
int vxlan_fdb_replay(const struct net_device *dev, __be32 vni,
		     struct notifier_block *nb,
		     struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan;
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	unsigned int h;
	int rc = 0;

	if (!netif_is_vxlan(dev))
		return -EINVAL;
	vxlan = netdev_priv(dev);

	for (h = 0; h < FDB_HASH_SIZE && !rc; ++h) {
		spin_lock_bh(&vxlan->hash_lock[h]);
		hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) {
			if (f->vni != vni)
				continue;

			list_for_each_entry(rdst, &f->remotes, list) {
				rc = vxlan_fdb_notify_one(nb, vxlan, f, rdst,
							  extack);
				if (rc)
					break;
			}
			if (rc)
				break;
		}
		spin_unlock_bh(&vxlan->hash_lock[h]);
	}

	return rc;
}
EXPORT_SYMBOL_GPL(vxlan_fdb_replay);

653 654 655 656 657 658 659 660 661 662 663 664
/* Clear the 'offloaded' mark on every remote of every FDB entry with VNI
 * @vni on @dev. Does nothing if @dev is not a VXLAN device.
 */
void vxlan_fdb_clear_offload(const struct net_device *dev, __be32 vni)
{
	struct vxlan_dev *vxlan;
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
	unsigned int h;

	if (!netif_is_vxlan(dev))
		return;
	vxlan = netdev_priv(dev);

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		spin_lock_bh(&vxlan->hash_lock[h]);
		hlist_for_each_entry(f, &vxlan->fdb_head[h], hlist) {
			if (f->vni != vni)
				continue;

			list_for_each_entry(rdst, &f->remotes, list) {
				rdst->offloaded = false;
			}
		}
		spin_unlock_bh(&vxlan->hash_lock[h]);
	}
}
EXPORT_SYMBOL_GPL(vxlan_fdb_clear_offload);

676 677
/* Replace destination of unicast mac */
static int vxlan_fdb_replace(struct vxlan_fdb *f,
678
			     union vxlan_addr *ip, __be16 port, __be32 vni,
679
			     __u32 ifindex, struct vxlan_rdst *oldrd)
680 681 682 683 684 685 686 687 688 689
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

	rd = list_first_entry_or_null(&f->remotes, struct vxlan_rdst, list);
	if (!rd)
		return 0;
690

691
	*oldrd = *rd;
692
	dst_cache_reset(&rd->dst_cache);
C
Cong Wang 已提交
693
	rd->remote_ip = *ip;
694 695 696
	rd->remote_port = port;
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;
697
	rd->offloaded = false;
698 699 700
	return 1;
}

701 702
/* Add/update destinations for multicast */
static int vxlan_fdb_append(struct vxlan_fdb *f,
703
			    union vxlan_addr *ip, __be16 port, __be32 vni,
704
			    __u32 ifindex, struct vxlan_rdst **rdp)
705 706 707 708 709 710 711
{
	struct vxlan_rdst *rd;

	rd = vxlan_fdb_find_rdst(f, ip, port, vni, ifindex);
	if (rd)
		return 0;

712 713 714
	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
	if (rd == NULL)
		return -ENOBUFS;
715 716 717 718 719 720

	if (dst_cache_init(&rd->dst_cache, GFP_ATOMIC)) {
		kfree(rd);
		return -ENOBUFS;
	}

C
Cong Wang 已提交
721
	rd->remote_ip = *ip;
722
	rd->remote_port = port;
723
	rd->offloaded = false;
724 725
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;
726 727 728

	list_add_tail_rcu(&rd->list, &f->remotes);

729
	*rdp = rd;
730 731 732
	return 1;
}

T
Tom Herbert 已提交
733 734 735
static struct vxlanhdr *vxlan_gro_remcsum(struct sk_buff *skb,
					  unsigned int off,
					  struct vxlanhdr *vh, size_t hdrlen,
736 737
					  __be32 vni_field,
					  struct gro_remcsum *grc,
738
					  bool nopartial)
T
Tom Herbert 已提交
739
{
740
	size_t start, offset;
T
Tom Herbert 已提交
741 742

	if (skb->remcsum_offload)
743
		return vh;
T
Tom Herbert 已提交
744 745 746 747

	if (!NAPI_GRO_CB(skb)->csum_valid)
		return NULL;

748 749
	start = vxlan_rco_start(vni_field);
	offset = start + vxlan_rco_offset(vni_field);
T
Tom Herbert 已提交
750

751 752
	vh = skb_gro_remcsum_process(skb, (void *)vh, off, hdrlen,
				     start, offset, grc, nopartial);
T
Tom Herbert 已提交
753 754 755 756 757 758

	skb->remcsum_offload = 1;

	return vh;
}

759 760 761
static struct sk_buff *vxlan_gro_receive(struct sock *sk,
					 struct list_head *head,
					 struct sk_buff *skb)
762
{
763 764
	struct sk_buff *pp = NULL;
	struct sk_buff *p;
765
	struct vxlanhdr *vh, *vh2;
766
	unsigned int hlen, off_vx;
767
	int flush = 1;
768
	struct vxlan_sock *vs = rcu_dereference_sk_user_data(sk);
769
	__be32 flags;
770 771 772
	struct gro_remcsum grc;

	skb_gro_remcsum_init(&grc);
773 774 775 776 777 778 779 780 781 782

	off_vx = skb_gro_offset(skb);
	hlen = off_vx + sizeof(*vh);
	vh   = skb_gro_header_fast(skb, off_vx);
	if (skb_gro_header_hard(skb, hlen)) {
		vh = skb_gro_header_slow(skb, hlen, off_vx);
		if (unlikely(!vh))
			goto out;
	}

T
Tom Herbert 已提交
783 784
	skb_gro_postpull_rcsum(skb, vh, sizeof(struct vxlanhdr));

785
	flags = vh->vx_flags;
T
Tom Herbert 已提交
786 787 788

	if ((flags & VXLAN_HF_RCO) && (vs->flags & VXLAN_F_REMCSUM_RX)) {
		vh = vxlan_gro_remcsum(skb, off_vx, vh, sizeof(struct vxlanhdr),
789
				       vh->vx_vni, &grc,
790 791
				       !!(vs->flags &
					  VXLAN_F_REMCSUM_NOPARTIAL));
T
Tom Herbert 已提交
792 793 794 795 796

		if (!vh)
			goto out;
	}

797 798
	skb_gro_pull(skb, sizeof(struct vxlanhdr)); /* pull vxlan header */

799
	list_for_each_entry(p, head, list) {
800 801 802 803
		if (!NAPI_GRO_CB(p)->same_flow)
			continue;

		vh2 = (struct vxlanhdr *)(p->data + off_vx);
T
Thomas Graf 已提交
804 805
		if (vh->vx_flags != vh2->vx_flags ||
		    vh->vx_vni != vh2->vx_vni) {
806 807 808 809 810
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}
	}

S
Sabrina Dubroca 已提交
811
	pp = call_gro_receive(eth_gro_receive, head, skb);
812
	flush = 0;
813 814

out:
815
	skb_gro_flush_final_remcsum(skb, pp, flush, &grc);
816 817 818 819

	return pp;
}

820
static int vxlan_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
821
{
822 823 824
	/* Sets 'skb->inner_mac_header' since we are always called with
	 * 'skb->encapsulation' set.
	 */
825
	return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
826 827
}

828 829 830
/* Allocate (GFP_ATOMIC) and initialise an FDB entry for @mac; the entry
 * is not linked into any hash chain here. Returns NULL on allocation
 * failure.
 */
static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, const u8 *mac,
					 __u16 state, __be32 src_vni,
					 __u16 ndm_flags)
{
	struct vxlan_fdb *f = kmalloc(sizeof(*f), GFP_ATOMIC);

	if (!f)
		return NULL;

	memcpy(f->eth_addr, mac, ETH_ALEN);
	f->vni = src_vni;
	f->state = state;
	f->flags = ndm_flags;
	f->updated = f->used = jiffies;

	/* starts with neither a nexthop nor any remotes attached */
	f->nh = NULL;
	RCU_INIT_POINTER(f->vdev, vxlan);
	INIT_LIST_HEAD(&f->nh_list);
	INIT_LIST_HEAD(&f->remotes);

	return f;
}

850 851 852 853 854 855 856 857
/* Account for @f in the device's address count and publish it at the
 * head of its hash chain.
 */
static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac,
			     __be32 src_vni, struct vxlan_fdb *f)
{
	struct hlist_head *chain;

	++vxlan->addrcnt;
	chain = vxlan_fdb_head(vxlan, mac, src_vni);
	hlist_add_head_rcu(&f->hlist, chain);
}

858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879
/* Point @fdb at the nexthop object with id @nhid.
 *
 * The nexthop must exist, still be referenceable, be an fdb nexthop and
 * a multipath group, and its family must agree with the device's default
 * remote address family.
 *
 * Returns 0 if @fdb already uses @nhid, 1 after switching to the new
 * nexthop, -EAFNOSUPPORT on a family mismatch and -EINVAL for the other
 * failures (an extack message is set in every failure case).
 */
static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb,
			       u32 nhid, struct netlink_ext_ack *extack)
{
	const union vxlan_addr *dflt_ip = &vxlan->default_dst.remote_ip;
	struct nexthop *old_nh = rtnl_dereference(fdb->nh);
	struct nexthop *nh;
	int err = -EINVAL;

	if (old_nh && old_nh->id == nhid)
		return 0;

	nh = nexthop_find_by_id(vxlan->net, nhid);
	if (!nh) {
		NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
		return err;
	}

	/* no reference was taken, so nothing to drop on this failure */
	if (!nexthop_get(nh)) {
		NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
		return err;
	}

	if (!nexthop_is_fdb(nh)) {
		NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop");
		goto err_put;
	}

	if (!nexthop_is_multipath(nh)) {
		NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group");
		goto err_put;
	}

	/* check nexthop group family */
	if (dflt_ip->sa.sa_family == AF_INET) {
		if (!nexthop_has_v4(nh)) {
			err = -EAFNOSUPPORT;
			NL_SET_ERR_MSG(extack, "Nexthop group family not supported");
			goto err_put;
		}
	} else if (dflt_ip->sa.sa_family == AF_INET6) {
		if (nexthop_has_v4(nh)) {
			err = -EAFNOSUPPORT;
			NL_SET_ERR_MSG(extack, "Nexthop group family not supported");
			goto err_put;
		}
	}

	/* detach from the previous nexthop before attaching to the new one */
	if (old_nh) {
		list_del_rcu(&fdb->nh_list);
		nexthop_put(old_nh);
	}
	rcu_assign_pointer(fdb->nh, nh);
	list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list);
	return 1;

err_put:
	nexthop_put(nh);
	return err;
}

S
stephen hemminger 已提交
922
static int vxlan_fdb_create(struct vxlan_dev *vxlan,
923 924
			    const u8 *mac, union vxlan_addr *ip,
			    __u16 state, __be16 port, __be32 src_vni,
P
Petr Machata 已提交
925
			    __be32 vni, __u32 ifindex, __u16 ndm_flags,
926 927
			    u32 nhid, struct vxlan_fdb **fdb,
			    struct netlink_ext_ack *extack)
928 929 930 931 932 933 934 935 936 937
{
	struct vxlan_rdst *rd = NULL;
	struct vxlan_fdb *f;
	int rc;

	if (vxlan->cfg.addrmax &&
	    vxlan->addrcnt >= vxlan->cfg.addrmax)
		return -ENOSPC;

	netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
938
	f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags);
939 940 941
	if (!f)
		return -ENOMEM;

942 943 944 945 946 947
	if (nhid)
		rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack);
	else
		rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
	if (rc < 0)
		goto errout;
948 949 950 951

	*fdb = f;

	return 0;
952 953 954 955

errout:
	kfree(f);
	return rc;
956 957
}

958
static void __vxlan_fdb_free(struct vxlan_fdb *f)
959 960
{
	struct vxlan_rdst *rd, *nd;
961 962 963 964 965
	struct nexthop *nh;

	nh = rcu_dereference_raw(f->nh);
	if (nh) {
		rcu_assign_pointer(f->nh, NULL);
966
		rcu_assign_pointer(f->vdev, NULL);
967 968
		nexthop_put(nh);
	}
969 970 971 972 973 974 975 976

	list_for_each_entry_safe(rd, nd, &f->remotes, list) {
		dst_cache_destroy(&rd->dst_cache);
		kfree(rd);
	}
	kfree(f);
}

977 978 979 980 981 982 983
/* RCU callback: recover the FDB entry from its rcu_head and free it. */
static void vxlan_fdb_free(struct rcu_head *head)
{
	__vxlan_fdb_free(container_of(head, struct vxlan_fdb, rcu));
}

984 985 986 987 988 989 990 991
/* Remove @f from the device: optionally send RTM_DELNEIGH notifications
 * (one for a nexthop entry, otherwise one per remote), unlink it from
 * the hash chain and nexthop list, and free it via call_rcu().
 */
static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
			      bool do_notify, bool swdev_notify)
{
	struct vxlan_rdst *rd;

	netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr);

	--vxlan->addrcnt;

	if (do_notify && rcu_access_pointer(f->nh)) {
		vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH,
				 swdev_notify, NULL);
	} else if (do_notify) {
		list_for_each_entry(rd, &f->remotes, list)
			vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH,
					 swdev_notify, NULL);
	}

	hlist_del_rcu(&f->hlist);
	list_del_rcu(&f->nh_list);
	call_rcu(&f->rcu, vxlan_fdb_free);
}

1007 1008 1009 1010 1011 1012 1013 1014
/* RCU callback: destroy a remote's dst cache and free the remote. */
static void vxlan_dst_free(struct rcu_head *head)
{
	struct vxlan_rdst *remote;

	remote = container_of(head, struct vxlan_rdst, rcu);
	dst_cache_destroy(&remote->dst_cache);
	kfree(remote);
}

1015 1016 1017 1018 1019
static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan,
				     union vxlan_addr *ip,
				     __u16 state, __u16 flags,
				     __be16 port, __be32 vni,
				     __u32 ifindex, __u16 ndm_flags,
1020
				     struct vxlan_fdb *f, u32 nhid,
1021 1022
				     bool swdev_notify,
				     struct netlink_ext_ack *extack)
S
stephen hemminger 已提交
1023
{
P
Petr Machata 已提交
1024
	__u16 fdb_flags = (ndm_flags & ~NTF_USE);
1025
	struct vxlan_rdst *rd = NULL;
1026
	struct vxlan_rdst oldrd;
S
stephen hemminger 已提交
1027
	int notify = 0;
1028 1029
	int rc = 0;
	int err;
S
stephen hemminger 已提交
1030

1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042
	if (nhid && !rcu_access_pointer(f->nh)) {
		NL_SET_ERR_MSG(extack,
			       "Cannot replace an existing non nexthop fdb with a nexthop");
		return -EOPNOTSUPP;
	}

	if (nhid && (flags & NLM_F_APPEND)) {
		NL_SET_ERR_MSG(extack,
			       "Cannot append to a nexthop fdb");
		return -EOPNOTSUPP;
	}

1043 1044 1045 1046 1047 1048 1049 1050 1051
	/* Do not allow an externally learned entry to take over an entry added
	 * by the user.
	 */
	if (!(fdb_flags & NTF_EXT_LEARNED) ||
	    !(f->flags & NTF_VXLAN_ADDED_BY_USER)) {
		if (f->state != state) {
			f->state = state;
			f->updated = jiffies;
			notify = 1;
1052
		}
1053 1054 1055 1056
		if (f->flags != fdb_flags) {
			f->flags = fdb_flags;
			f->updated = jiffies;
			notify = 1;
1057
		}
1058
	}
1059

1060 1061 1062 1063
	if ((flags & NLM_F_REPLACE)) {
		/* Only change unicasts */
		if (!(is_multicast_ether_addr(f->eth_addr) ||
		      is_zero_ether_addr(f->eth_addr))) {
1064 1065 1066 1067 1068 1069 1070 1071
			if (nhid) {
				rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack);
				if (rc < 0)
					return rc;
			} else {
				rc = vxlan_fdb_replace(f, ip, port, vni,
						       ifindex, &oldrd);
			}
1072
			notify |= rc;
1073
		} else {
1074
			NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries");
1075
			return -EOPNOTSUPP;
1076 1077 1078 1079 1080 1081
		}
	}
	if ((flags & NLM_F_APPEND) &&
	    (is_multicast_ether_addr(f->eth_addr) ||
	     is_zero_ether_addr(f->eth_addr))) {
		rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
1082

1083
		if (rc < 0)
1084
			return rc;
1085
		notify |= rc;
S
stephen hemminger 已提交
1086 1087
	}

1088 1089 1090
	if (ndm_flags & NTF_USE)
		f->used = jiffies;

1091 1092 1093
	if (notify) {
		if (rd == NULL)
			rd = first_remote_rtnl(f);
1094

1095
		err = vxlan_fdb_notify(vxlan, f, rd, RTM_NEWNEIGH,
1096
				       swdev_notify, extack);
1097 1098
		if (err)
			goto err_notify;
1099
	}
S
stephen hemminger 已提交
1100 1101

	return 0;
1102 1103

err_notify:
1104 1105
	if (nhid)
		return err;
1106 1107
	if ((flags & NLM_F_REPLACE) && rc)
		*rd = oldrd;
1108
	else if ((flags & NLM_F_APPEND) && rc) {
1109
		list_del_rcu(&rd->list);
1110 1111
		call_rcu(&rd->rcu, vxlan_dst_free);
	}
1112
	return err;
S
stephen hemminger 已提交
1113 1114
}

1115 1116 1117 1118
static int vxlan_fdb_update_create(struct vxlan_dev *vxlan,
				   const u8 *mac, union vxlan_addr *ip,
				   __u16 state, __u16 flags,
				   __be16 port, __be32 src_vni, __be32 vni,
1119
				   __u32 ifindex, __u16 ndm_flags, u32 nhid,
1120 1121
				   bool swdev_notify,
				   struct netlink_ext_ack *extack)
1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133
{
	__u16 fdb_flags = (ndm_flags & ~NTF_USE);
	struct vxlan_fdb *f;
	int rc;

	/* Disallow replace to add a multicast entry */
	if ((flags & NLM_F_REPLACE) &&
	    (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
		return -EOPNOTSUPP;

	netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
	rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni,
1134
			      vni, ifindex, fdb_flags, nhid, &f, extack);
1135 1136 1137
	if (rc < 0)
		return rc;

1138
	vxlan_fdb_insert(vxlan, mac, src_vni, f);
1139
	rc = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH,
1140
			      swdev_notify, extack);
1141 1142 1143
	if (rc)
		goto err_notify;

1144
	return 0;
1145 1146 1147 1148

err_notify:
	vxlan_fdb_destroy(vxlan, f, false, false);
	return rc;
1149 1150 1151 1152 1153 1154 1155
}

/* Add new entry to forwarding table -- assumes lock held */
static int vxlan_fdb_update(struct vxlan_dev *vxlan,
			    const u8 *mac, union vxlan_addr *ip,
			    __u16 state, __u16 flags,
			    __be16 port, __be32 src_vni, __be32 vni,
1156
			    __u32 ifindex, __u16 ndm_flags, u32 nhid,
1157 1158
			    bool swdev_notify,
			    struct netlink_ext_ack *extack)
1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171
{
	struct vxlan_fdb *f;

	f = __vxlan_find_mac(vxlan, mac, src_vni);
	if (f) {
		if (flags & NLM_F_EXCL) {
			netdev_dbg(vxlan->dev,
				   "lost race to create %pM\n", mac);
			return -EEXIST;
		}

		return vxlan_fdb_update_existing(vxlan, ip, state, flags, port,
						 vni, ifindex, ndm_flags, f,
1172
						 nhid, swdev_notify, extack);
1173 1174 1175 1176 1177 1178
	} else {
		if (!(flags & NLM_F_CREATE))
			return -ENOENT;

		return vxlan_fdb_update_create(vxlan, mac, ip, state, flags,
					       port, src_vni, vni, ifindex,
1179 1180
					       ndm_flags, nhid, swdev_notify,
					       extack);
1181 1182 1183
	}
}

1184
/* Remove a single remote from an fdb entry: unlink it from the remotes
 * list, send RTM_DELNEIGH for it, then free it through call_rcu().
 */
static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan,
				  struct vxlan_fdb *f,
				  struct vxlan_rdst *rd,
				  bool swdev_notify)
{
	/* Unlink first; the notification still receives the remote itself. */
	list_del_rcu(&rd->list);

	vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, swdev_notify, NULL);

	call_rcu(&rd->rcu, vxlan_dst_free);
}

M
Mike Rapoport 已提交
1192
/* Decode the netlink attributes of an fdb request into their C values.
 *
 * Attributes that are absent fall back to device defaults: the wildcard
 * address of the default remote's family, cfg.dst_port, the default remote
 * VNI (used for both vni and src_vni), ifindex 0 and nhid 0.
 *
 * Returns 0 on success, -EINVAL for a bad attribute combination or length,
 * -EADDRNOTAVAIL when NDA_IFINDEX names no device in the device's netns, or
 * the error returned by vxlan_nla_get_addr().
 */
static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan,
			   union vxlan_addr *ip, __be16 *port, __be32 *src_vni,
			   __be32 *vni, u32 *ifindex, u32 *nhid)
{
	/* A nexthop id excludes every attribute describing an explicit remote. */
	if (tb[NDA_NH_ID]) {
		if (tb[NDA_DST] || tb[NDA_VNI] || tb[NDA_IFINDEX] ||
		    tb[NDA_PORT])
			return -EINVAL;
	}

	if (tb[NDA_DST]) {
		int err = vxlan_nla_get_addr(ip, tb[NDA_DST]);

		if (err)
			return err;
	} else {
		union vxlan_addr *dflt = &vxlan->default_dst.remote_ip;

		if (dflt->sa.sa_family == AF_INET) {
			ip->sin.sin_addr.s_addr = htonl(INADDR_ANY);
			ip->sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
			ip->sin6.sin6_addr = in6addr_any;
			ip->sa.sa_family = AF_INET6;
#endif
		}
	}

	if (!tb[NDA_PORT]) {
		*port = vxlan->cfg.dst_port;
	} else {
		if (nla_len(tb[NDA_PORT]) != sizeof(__be16))
			return -EINVAL;
		*port = nla_get_be16(tb[NDA_PORT]);
	}

	if (!tb[NDA_VNI]) {
		*vni = vxlan->default_dst.remote_vni;
	} else {
		if (nla_len(tb[NDA_VNI]) != sizeof(u32))
			return -EINVAL;
		*vni = cpu_to_be32(nla_get_u32(tb[NDA_VNI]));
	}

	if (!tb[NDA_SRC_VNI]) {
		*src_vni = vxlan->default_dst.remote_vni;
	} else {
		if (nla_len(tb[NDA_SRC_VNI]) != sizeof(u32))
			return -EINVAL;
		*src_vni = cpu_to_be32(nla_get_u32(tb[NDA_SRC_VNI]));
	}

	if (!tb[NDA_IFINDEX]) {
		*ifindex = 0;
	} else {
		if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
			return -EINVAL;
		*ifindex = nla_get_u32(tb[NDA_IFINDEX]);

		/* The interface must exist in the vxlan device's namespace. */
		if (!__dev_get_by_index(dev_net(vxlan->dev), *ifindex))
			return -EADDRNOTAVAIL;
	}

	*nhid = tb[NDA_NH_ID] ? nla_get_u32(tb[NDA_NH_ID]) : 0;

	return 0;
}

/* Add static entry (via netlink).
 *
 * The request must carry NUD_PERMANENT or NUD_REACHABLE and at least one of
 * NDA_DST / NDA_NH_ID, and its address family must match the device's
 * default remote. The entry is tagged NTF_VXLAN_ADDED_BY_USER and the update
 * runs under the hash bucket lock selected by (addr, src_vni).
 */
static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr, u16 vid, u16 flags,
			 struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	__be32 src_vni, vni;
	union vxlan_addr ip;
	u32 ifindex, nhid;
	u32 bucket;
	__be16 port;
	int err;

	if ((ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE)) == 0) {
		pr_info("RTM_NEWNEIGH with invalid state %#x\n",
			ndm->ndm_state);
		return -EINVAL;
	}

	if (!tb)
		return -EINVAL;
	if (!tb[NDA_DST] && !tb[NDA_NH_ID])
		return -EINVAL;

	err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
			      &nhid);
	if (err)
		return err;

	if (ip.sa.sa_family != vxlan->default_dst.remote_ip.sa.sa_family)
		return -EAFNOSUPPORT;

	bucket = fdb_head_index(vxlan, addr, src_vni);

	spin_lock_bh(&vxlan->hash_lock[bucket]);
	err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags,
			       port, src_vni, vni, ifindex,
			       ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER,
			       nhid, true, extack);
	spin_unlock_bh(&vxlan->hash_lock[bucket]);

	return err;
}

1309 1310
static int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
			      const unsigned char *addr, union vxlan_addr ip,
1311
			      __be16 port, __be32 src_vni, __be32 vni,
1312
			      u32 ifindex, bool swdev_notify)
S
stephen hemminger 已提交
1313
{
1314
	struct vxlan_rdst *rd = NULL;
1315
	struct vxlan_fdb *f;
1316
	int err = -ENOENT;
1317

1318
	f = vxlan_find_mac(vxlan, addr, src_vni);
1319
	if (!f)
1320
		return err;
1321

C
Cong Wang 已提交
1322 1323
	if (!vxlan_addr_any(&ip)) {
		rd = vxlan_fdb_find_rdst(f, &ip, port, vni, ifindex);
1324 1325 1326 1327 1328 1329 1330 1331
		if (!rd)
			goto out;
	}

	/* remove a destination if it's not the only one on the list,
	 * otherwise destroy the fdb entry
	 */
	if (rd && !list_is_singular(&f->remotes)) {
1332
		vxlan_fdb_dst_destroy(vxlan, f, rd, swdev_notify);
1333
		goto out;
S
stephen hemminger 已提交
1334
	}
1335

1336
	vxlan_fdb_destroy(vxlan, f, true, swdev_notify);
1337 1338

out:
1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349
	return 0;
}

/* Delete entry (via netlink).
 *
 * Parses the request attributes and performs the delete under the hash
 * bucket lock selected by (addr, src_vni). The parsed nexthop id is not
 * used by the delete itself.
 */
static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
			    struct net_device *dev,
			    const unsigned char *addr, u16 vid)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	__be32 src_vni, vni;
	union vxlan_addr ip;
	u32 ifindex, nhid;
	u32 bucket;
	__be16 port;
	int rc;

	rc = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex,
			     &nhid);
	if (rc)
		return rc;

	bucket = fdb_head_index(vxlan, addr, src_vni);

	spin_lock_bh(&vxlan->hash_lock[bucket]);
	rc = __vxlan_fdb_delete(vxlan, addr, ip, port, src_vni, vni, ifindex,
				true);
	spin_unlock_bh(&vxlan->hash_lock[bucket]);

	return rc;
}

/* Dump forwarding table */
static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
1371
			  struct net_device *dev,
1372
			  struct net_device *filter_dev, int *idx)
S
stephen hemminger 已提交
1373 1374 1375
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	unsigned int h;
1376
	int err = 0;
S
stephen hemminger 已提交
1377 1378 1379 1380

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct vxlan_fdb *f;

1381
		rcu_read_lock();
1382
		hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
1383 1384
			struct vxlan_rdst *rd;

1385
			if (rcu_access_pointer(f->nh)) {
1386 1387
				if (*idx < cb->args[2])
					goto skip_nh;
1388 1389 1390 1391 1392
				err = vxlan_fdb_info(skb, vxlan, f,
						     NETLINK_CB(cb->skb).portid,
						     cb->nlh->nlmsg_seq,
						     RTM_NEWNEIGH,
						     NLM_F_MULTI, NULL);
1393 1394
				if (err < 0) {
					rcu_read_unlock();
1395
					goto out;
1396
				}
1397 1398
skip_nh:
				*idx += 1;
1399 1400 1401
				continue;
			}

1402
			list_for_each_entry_rcu(rd, &f->remotes, list) {
1403
				if (*idx < cb->args[2])
1404 1405
					goto skip;

1406 1407 1408 1409 1410
				err = vxlan_fdb_info(skb, vxlan, f,
						     NETLINK_CB(cb->skb).portid,
						     cb->nlh->nlmsg_seq,
						     RTM_NEWNEIGH,
						     NLM_F_MULTI, rd);
1411 1412
				if (err < 0) {
					rcu_read_unlock();
1413
					goto out;
1414
				}
1415
skip:
1416
				*idx += 1;
1417
			}
S
stephen hemminger 已提交
1418
		}
1419
		rcu_read_unlock();
S
stephen hemminger 已提交
1420
	}
1421
out:
1422
	return err;
S
stephen hemminger 已提交
1423 1424
}

R
Roopa Prabhu 已提交
1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457
/* Fill @skb with a single fdb entry looked up by (addr, VNI).
 *
 * The VNI comes from NDA_VNI when present, else from the device's default
 * remote. Returns -ENOENT (with an extack message) when no entry matches,
 * otherwise the result of vxlan_fdb_info() for the entry's first remote.
 */
static int vxlan_fdb_get(struct sk_buff *skb,
			 struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr,
			 u16 vid, u32 portid, u32 seq,
			 struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *entry;
	__be32 vni;
	int err;

	vni = tb[NDA_VNI] ? cpu_to_be32(nla_get_u32(tb[NDA_VNI])) :
			    vxlan->default_dst.remote_vni;

	rcu_read_lock();

	entry = __vxlan_find_mac(vxlan, addr, vni);
	if (entry) {
		err = vxlan_fdb_info(skb, vxlan, entry, portid, seq,
				     RTM_NEWNEIGH, 0, first_remote_rcu(entry));
	} else {
		NL_SET_ERR_MSG(extack, "Fdb entry not found");
		err = -ENOENT;
	}

	rcu_read_unlock();
	return err;
}

S
stephen hemminger 已提交
1458 1459
/* Watch incoming packets to learn mapping between Ethernet address
 * and Tunnel endpoint.
1460
 * Return true if packet is bogus and should be dropped.
S
stephen hemminger 已提交
1461
 */
1462
static bool vxlan_snoop(struct net_device *dev,
1463
			union vxlan_addr *src_ip, const u8 *src_mac,
1464
			u32 src_ifindex, __be32 vni)
S
stephen hemminger 已提交
1465 1466 1467
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
1468 1469 1470 1471 1472 1473 1474
	u32 ifindex = 0;

#if IS_ENABLED(CONFIG_IPV6)
	if (src_ip->sa.sa_family == AF_INET6 &&
	    (ipv6_addr_type(&src_ip->sin6.sin6_addr) & IPV6_ADDR_LINKLOCAL))
		ifindex = src_ifindex;
#endif
S
stephen hemminger 已提交
1475

1476
	f = vxlan_find_mac(vxlan, src_mac, vni);
S
stephen hemminger 已提交
1477
	if (likely(f)) {
1478
		struct vxlan_rdst *rdst = first_remote_rcu(f);
1479

1480 1481
		if (likely(vxlan_addr_equal(&rdst->remote_ip, src_ip) &&
			   rdst->remote_ifindex == ifindex))
1482 1483 1484
			return false;

		/* Don't migrate static entries, drop packets */
1485
		if (f->state & (NUD_PERMANENT | NUD_NOARP))
1486
			return true;
S
stephen hemminger 已提交
1487

1488 1489 1490 1491
		/* Don't override an fdb with nexthop with a learnt entry */
		if (rcu_access_pointer(f->nh))
			return true;

S
stephen hemminger 已提交
1492 1493
		if (net_ratelimit())
			netdev_info(dev,
C
Cong Wang 已提交
1494
				    "%pM migrated from %pIS to %pIS\n",
1495
				    src_mac, &rdst->remote_ip.sa, &src_ip->sa);
S
stephen hemminger 已提交
1496

C
Cong Wang 已提交
1497
		rdst->remote_ip = *src_ip;
S
stephen hemminger 已提交
1498
		f->updated = jiffies;
1499
		vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL);
S
stephen hemminger 已提交
1500
	} else {
1501 1502
		u32 hash_index = fdb_head_index(vxlan, src_mac, vni);

S
stephen hemminger 已提交
1503
		/* learned new entry */
1504
		spin_lock(&vxlan->hash_lock[hash_index]);
1505 1506 1507

		/* close off race between vxlan_flush and incoming packets */
		if (netif_running(dev))
1508
			vxlan_fdb_update(vxlan, src_mac, src_ip,
1509 1510
					 NUD_REACHABLE,
					 NLM_F_EXCL|NLM_F_CREATE,
1511
					 vxlan->cfg.dst_port,
1512
					 vni,
1513
					 vxlan->default_dst.remote_vni,
1514
					 ifindex, NTF_SELF, 0, true, NULL);
1515
		spin_unlock(&vxlan->hash_lock[hash_index]);
S
stephen hemminger 已提交
1516
	}
1517 1518

	return false;
S
stephen hemminger 已提交
1519 1520 1521
}

/* See if multicast group is already in use by other ID */
1522
static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
S
stephen hemminger 已提交
1523
{
1524
	struct vxlan_dev *vxlan;
1525
	struct vxlan_sock *sock4;
A
Arnd Bergmann 已提交
1526 1527 1528
#if IS_ENABLED(CONFIG_IPV6)
	struct vxlan_sock *sock6;
#endif
1529
	unsigned short family = dev->default_dst.remote_ip.sa.sa_family;
S
stephen hemminger 已提交
1530

1531 1532
	sock4 = rtnl_dereference(dev->vn4_sock);

1533 1534 1535
	/* The vxlan_sock is only used by dev, leaving group has
	 * no effect on other vxlan devices.
	 */
1536
	if (family == AF_INET && sock4 && refcount_read(&sock4->refcnt) == 1)
1537
		return false;
1538
#if IS_ENABLED(CONFIG_IPV6)
1539
	sock6 = rtnl_dereference(dev->vn6_sock);
1540
	if (family == AF_INET6 && sock6 && refcount_read(&sock6->refcnt) == 1)
1541 1542
		return false;
#endif
1543

1544
	list_for_each_entry(vxlan, &vn->vxlan_list, next) {
1545
		if (!netif_running(vxlan->dev) || vxlan == dev)
1546
			continue;
S
stephen hemminger 已提交
1547

1548 1549
		if (family == AF_INET &&
		    rtnl_dereference(vxlan->vn4_sock) != sock4)
1550
			continue;
1551
#if IS_ENABLED(CONFIG_IPV6)
1552 1553
		if (family == AF_INET6 &&
		    rtnl_dereference(vxlan->vn6_sock) != sock6)
1554 1555
			continue;
#endif
1556 1557 1558 1559 1560 1561 1562 1563 1564 1565

		if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip,
				      &dev->default_dst.remote_ip))
			continue;

		if (vxlan->default_dst.remote_ifindex !=
		    dev->default_dst.remote_ifindex)
			continue;

		return true;
1566
	}
S
stephen hemminger 已提交
1567 1568 1569 1570

	return false;
}

1571
static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
1572
{
1573
	struct vxlan_net *vn;
1574

1575
	if (!vs)
1576
		return false;
1577
	if (!refcount_dec_and_test(&vs->refcnt))
1578
		return false;
S
stephen hemminger 已提交
1579

1580
	vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id);
1581
	spin_lock(&vn->sock_lock);
1582
	hlist_del_rcu(&vs->hlist);
1583
	udp_tunnel_notify_del_rx_port(vs->sock,
1584 1585
				      (vs->flags & VXLAN_F_GPE) ?
				      UDP_TUNNEL_TYPE_VXLAN_GPE :
1586
				      UDP_TUNNEL_TYPE_VXLAN);
1587 1588
	spin_unlock(&vn->sock_lock);

1589
	return true;
S
stephen hemminger 已提交
1590 1591
}

1592 1593
static void vxlan_sock_release(struct vxlan_dev *vxlan)
{
1594
	struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
1595
#if IS_ENABLED(CONFIG_IPV6)
1596 1597
	struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

1598
	RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
1599 1600
#endif

1601
	RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
1602 1603
	synchronize_net();

1604 1605
	vxlan_vs_del_dev(vxlan);

1606 1607 1608
	if (__vxlan_sock_release_prep(sock4)) {
		udp_tunnel_sock_release(sock4->sock);
		kfree(sock4);
1609 1610 1611
	}

#if IS_ENABLED(CONFIG_IPV6)
1612 1613 1614
	if (__vxlan_sock_release_prep(sock6)) {
		udp_tunnel_sock_release(sock6->sock);
		kfree(sock6);
1615
	}
1616 1617 1618
#endif
}

1619
/* Update multicast group membership when first VNI on
1620
 * multicast address is brought up
1621
 */
1622
static int vxlan_igmp_join(struct vxlan_dev *vxlan)
S
stephen hemminger 已提交
1623
{
1624
	struct sock *sk;
C
Cong Wang 已提交
1625 1626
	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
	int ifindex = vxlan->default_dst.remote_ifindex;
1627
	int ret = -EINVAL;
S
stephen hemminger 已提交
1628

C
Cong Wang 已提交
1629
	if (ip->sa.sa_family == AF_INET) {
1630
		struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
C
Cong Wang 已提交
1631 1632 1633 1634 1635
		struct ip_mreqn mreq = {
			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
			.imr_ifindex		= ifindex,
		};

1636
		sk = sock4->sock->sk;
1637
		lock_sock(sk);
1638
		ret = ip_mc_join_group(sk, &mreq);
1639
		release_sock(sk);
C
Cong Wang 已提交
1640 1641
#if IS_ENABLED(CONFIG_IPV6)
	} else {
1642 1643 1644
		struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

		sk = sock6->sock->sk;
1645
		lock_sock(sk);
1646 1647
		ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
						   &ip->sin6.sin6_addr);
1648
		release_sock(sk);
C
Cong Wang 已提交
1649 1650
#endif
	}
S
stephen hemminger 已提交
1651

1652
	return ret;
S
stephen hemminger 已提交
1653 1654 1655
}

/* Inverse of vxlan_igmp_join when last VNI is brought down */
1656
static int vxlan_igmp_leave(struct vxlan_dev *vxlan)
S
stephen hemminger 已提交
1657
{
1658
	struct sock *sk;
C
Cong Wang 已提交
1659 1660
	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
	int ifindex = vxlan->default_dst.remote_ifindex;
1661
	int ret = -EINVAL;
S
stephen hemminger 已提交
1662

C
Cong Wang 已提交
1663
	if (ip->sa.sa_family == AF_INET) {
1664
		struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
C
Cong Wang 已提交
1665 1666 1667 1668 1669
		struct ip_mreqn mreq = {
			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
			.imr_ifindex		= ifindex,
		};

1670
		sk = sock4->sock->sk;
1671
		lock_sock(sk);
1672
		ret = ip_mc_leave_group(sk, &mreq);
1673
		release_sock(sk);
C
Cong Wang 已提交
1674 1675
#if IS_ENABLED(CONFIG_IPV6)
	} else {
1676 1677 1678
		struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);

		sk = sock6->sock->sk;
1679
		lock_sock(sk);
1680 1681
		ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
						   &ip->sin6.sin6_addr);
1682
		release_sock(sk);
C
Cong Wang 已提交
1683 1684
#endif
	}
S
stephen hemminger 已提交
1685

1686
	return ret;
S
stephen hemminger 已提交
1687 1688
}

1689 1690
/* Handle the remote checksum offload (RCO) part of a received header.
 *
 * When the RCO flag is set and the skb is not already marked
 * remcsum_offload, the checksum start/offset encoded in the VNI field are
 * pulled and processed. In every non-failing case the RCO flag is cleared
 * from @unparsed and its vx_vni is masked with VXLAN_VNI_MASK.
 *
 * Returns false only when the required bytes cannot be pulled.
 */
static bool vxlan_remcsum(struct vxlanhdr *unparsed,
			  struct sk_buff *skb, u32 vxflags)
{
	if ((unparsed->vx_flags & VXLAN_HF_RCO) && !skb->remcsum_offload) {
		size_t start = vxlan_rco_start(unparsed->vx_vni);
		size_t offset = start + vxlan_rco_offset(unparsed->vx_vni);

		/* The 16-bit checksum field at @offset must be reachable. */
		if (!pskb_may_pull(skb, offset + sizeof(u16)))
			return false;

		skb_remcsum_process(skb, (void *)(vxlan_hdr(skb) + 1),
				    start, offset,
				    !!(vxflags & VXLAN_F_REMCSUM_NOPARTIAL));
	}

	unparsed->vx_flags &= ~VXLAN_HF_RCO;
	unparsed->vx_vni &= VXLAN_VNI_MASK;

	return true;
}

1711
/* Extract Group Based Policy information from a received header into @md.
 *
 * When the GBP flag is set, md->gbp is built from the policy id plus the
 * DONT_LEARN / POLICY_APPLIED bits, a metadata dst attached to the skb (if
 * any) is marked as carrying VXLAN options, and -- unless metadata
 * collection is enabled -- the value is also copied to skb->mark. The GBP
 * bits are always cleared from @unparsed.
 */
static void vxlan_parse_gbp_hdr(struct vxlanhdr *unparsed,
				struct sk_buff *skb, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp = (struct vxlanhdr_gbp *)unparsed;

	if (unparsed->vx_flags & VXLAN_HF_GBP) {
		struct metadata_dst *tun_dst;

		md->gbp = ntohs(gbp->policy_id);

		tun_dst = (struct metadata_dst *)skb_dst(skb);
		if (tun_dst) {
			tun_dst->u.tun_info.key.tun_flags |= TUNNEL_VXLAN_OPT;
			tun_dst->u.tun_info.options_len = sizeof(*md);
		}

		if (gbp->dont_learn)
			md->gbp |= VXLAN_GBP_DONT_LEARN;

		if (gbp->policy_applied)
			md->gbp |= VXLAN_GBP_POLICY_APPLIED;

		/* In flow-based mode, GBP is carried in dst_metadata */
		if (!(vxflags & VXLAN_F_COLLECT_METADATA))
			skb->mark = md->gbp;
	}

	unparsed->vx_flags &= ~VXLAN_GBP_USED_BITS;
}

J
Jiri Benc 已提交
1741
static bool vxlan_parse_gpe_hdr(struct vxlanhdr *unparsed,
J
Jiri Benc 已提交
1742
				__be16 *protocol,
J
Jiri Benc 已提交
1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761
				struct sk_buff *skb, u32 vxflags)
{
	struct vxlanhdr_gpe *gpe = (struct vxlanhdr_gpe *)unparsed;

	/* Need to have Next Protocol set for interfaces in GPE mode. */
	if (!gpe->np_applied)
		return false;
	/* "The initial version is 0. If a receiver does not support the
	 * version indicated it MUST drop the packet.
	 */
	if (gpe->version != 0)
		return false;
	/* "When the O bit is set to 1, the packet is an OAM packet and OAM
	 * processing MUST occur." However, we don't implement OAM
	 * processing, thus drop the packet.
	 */
	if (gpe->oam_flag)
		return false;

1762 1763
	*protocol = tun_p_to_eth_p(gpe->next_protocol);
	if (!*protocol)
J
Jiri Benc 已提交
1764 1765 1766 1767 1768 1769
		return false;

	unparsed->vx_flags &= ~VXLAN_GPE_USED_BITS;
	return true;
}

1770 1771
/* Set up the inner Ethernet header of a received frame and, when learning
 * is enabled, snoop its source address.
 *
 * Returns false when the frame must be dropped: its source MAC is the
 * vxlan device's own address, or vxlan_snoop() reports it as bogus.
 */
static bool vxlan_set_mac(struct vxlan_dev *vxlan,
			  struct vxlan_sock *vs,
			  struct sk_buff *skb, __be32 vni)
{
	/* Sampled up front, before eth_type_trans() is called with vxlan->dev. */
	u32 ifindex = skb->dev->ifindex;
	union vxlan_addr outer_src;

	skb_reset_mac_header(skb);
	skb->protocol = eth_type_trans(skb, vxlan->dev);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	/* Ignore packet loops (and multicast echo) */
	if (ether_addr_equal(eth_hdr(skb)->h_source, vxlan->dev->dev_addr))
		return false;

	/* Get address from the outer IP header */
	if (vxlan_get_sk_family(vs) == AF_INET) {
		outer_src.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
		outer_src.sa.sa_family = AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		outer_src.sin6.sin6_addr = ipv6_hdr(skb)->saddr;
		outer_src.sa.sa_family = AF_INET6;
#endif
	}

	if (!(vxlan->cfg.flags & VXLAN_F_LEARN))
		return true;

	return !vxlan_snoop(skb->dev, &outer_src, eth_hdr(skb)->h_source,
			    ifindex, vni);
}

1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826
/* Run ECN decapsulation between the outer IP header @oiph and the inner
 * packet, using the IPv4 or IPv6 helper according to the socket family.
 *
 * A non-zero result is logged (rate limited) when log_ecn_error is set.
 * Returns true when the result is at most 1, false for anything larger.
 */
static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
				  struct sk_buff *skb)
{
	const bool outer_v4 = vxlan_get_sk_family(vs) == AF_INET;
	int err = 0;

	if (outer_v4)
		err = IP_ECN_decapsulate(oiph, skb);
#if IS_ENABLED(CONFIG_IPV6)
	else
		err = IP6_ECN_decapsulate(oiph, skb);
#endif

	if (unlikely(err) && log_ecn_error) {
		if (outer_v4) {
			struct iphdr *iph = oiph;

			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					     &iph->saddr, iph->tos);
		} else {
			struct ipv6hdr *ip6h = oiph;

			net_info_ratelimited("non-ECT from %pI6\n",
					     &ip6h->saddr);
		}
	}

	return err <= 1;
}

S
stephen hemminger 已提交
1827
/* Callback from net/ipv4/udp.c to receive packets */
1828
static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
S
stephen hemminger 已提交
1829
{
1830
	struct vxlan_dev *vxlan;
P
Pravin B Shelar 已提交
1831
	struct vxlan_sock *vs;
1832
	struct vxlanhdr unparsed;
T
Thomas Graf 已提交
1833 1834
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
J
Jiri Benc 已提交
1835
	__be16 protocol = htons(ETH_P_TEB);
J
Jiri Benc 已提交
1836
	bool raw_proto = false;
1837
	void *oiph;
1838
	__be32 vni = 0;
S
stephen hemminger 已提交
1839

J
Jiri Benc 已提交
1840
	/* Need UDP and VXLAN header to be present */
1841
	if (!pskb_may_pull(skb, VXLAN_HLEN))
1842
		goto drop;
S
stephen hemminger 已提交
1843

1844
	unparsed = *vxlan_hdr(skb);
J
Jiri Benc 已提交
1845 1846 1847 1848 1849 1850
	/* VNI flag always required to be set */
	if (!(unparsed.vx_flags & VXLAN_HF_VNI)) {
		netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
			   ntohl(vxlan_hdr(skb)->vx_flags),
			   ntohl(vxlan_hdr(skb)->vx_vni));
		/* Return non vxlan pkt */
1851
		goto drop;
S
stephen hemminger 已提交
1852
	}
J
Jiri Benc 已提交
1853 1854
	unparsed.vx_flags &= ~VXLAN_HF_VNI;
	unparsed.vx_vni &= ~VXLAN_VNI_MASK;
S
stephen hemminger 已提交
1855

1856
	vs = rcu_dereference_sk_user_data(sk);
P
Pravin B Shelar 已提交
1857
	if (!vs)
S
stephen hemminger 已提交
1858 1859
		goto drop;

1860 1861
	vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);

1862
	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
1863 1864 1865
	if (!vxlan)
		goto drop;

J
Jiri Benc 已提交
1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876
	/* For backwards compatibility, only allow reserved fields to be
	 * used by VXLAN extensions if explicitly requested.
	 */
	if (vs->flags & VXLAN_F_GPE) {
		if (!vxlan_parse_gpe_hdr(&unparsed, &protocol, skb, vs->flags))
			goto drop;
		raw_proto = true;
	}

	if (__iptunnel_pull_header(skb, VXLAN_HLEN, protocol, raw_proto,
				   !net_eq(vxlan->net, dev_net(vxlan->dev))))
1877
		goto drop;
1878

1879
	if (vs->flags & VXLAN_F_REMCSUM_RX)
1880
		if (unlikely(!vxlan_remcsum(&unparsed, skb, vs->flags)))
1881 1882
			goto drop;

T
Thomas Graf 已提交
1883
	if (vxlan_collect_metadata(vs)) {
1884
		struct metadata_dst *tun_dst;
J
Jiri Benc 已提交
1885

1886
		tun_dst = udp_tun_rx_dst(skb, vxlan_get_sk_family(vs), TUNNEL_KEY,
1887
					 key32_to_tunnel_id(vni), sizeof(*md));
1888

T
Thomas Graf 已提交
1889 1890 1891
		if (!tun_dst)
			goto drop;

1892
		md = ip_tunnel_info_opts(&tun_dst->u.tun_info);
1893 1894

		skb_dst_set(skb, (struct dst_entry *)tun_dst);
T
Thomas Graf 已提交
1895 1896 1897 1898
	} else {
		memset(md, 0, sizeof(*md));
	}

1899
	if (vs->flags & VXLAN_F_GBP)
1900
		vxlan_parse_gbp_hdr(&unparsed, skb, vs->flags, md);
J
Jiri Benc 已提交
1901 1902 1903
	/* Note that GBP and GPE can never be active together. This is
	 * ensured in vxlan_dev_configure.
	 */
T
Thomas Graf 已提交
1904

1905
	if (unparsed.vx_flags || unparsed.vx_vni) {
1906 1907 1908 1909
		/* If there are any unprocessed flags remaining treat
		 * this as a malformed packet. This behavior diverges from
		 * VXLAN RFC (RFC7348) which stipulates that bits in reserved
		 * in reserved fields are to be ignored. The approach here
1910
		 * maintains compatibility with previous stack code, and also
1911 1912 1913
		 * is more robust and provides a little more security in
		 * adding extensions to VXLAN.
		 */
J
Jiri Benc 已提交
1914
		goto drop;
1915 1916
	}

J
Jiri Benc 已提交
1917
	if (!raw_proto) {
1918
		if (!vxlan_set_mac(vxlan, vs, skb, vni))
J
Jiri Benc 已提交
1919 1920
			goto drop;
	} else {
1921
		skb_reset_mac_header(skb);
J
Jiri Benc 已提交
1922 1923 1924
		skb->dev = vxlan->dev;
		skb->pkt_type = PACKET_HOST;
	}
1925 1926 1927 1928 1929 1930 1931 1932 1933 1934

	oiph = skb_network_header(skb);
	skb_reset_network_header(skb);

	if (!vxlan_ecn_decapsulate(vs, oiph, skb)) {
		++vxlan->dev->stats.rx_frame_errors;
		++vxlan->dev->stats.rx_errors;
		goto drop;
	}

1935 1936 1937 1938 1939 1940 1941 1942
	rcu_read_lock();

	if (unlikely(!(vxlan->dev->flags & IFF_UP))) {
		rcu_read_unlock();
		atomic_long_inc(&vxlan->dev->rx_dropped);
		goto drop;
	}

1943
	dev_sw_netstats_rx_add(vxlan->dev, skb->len);
1944
	gro_cells_receive(&vxlan->gro_cells, skb);
1945 1946 1947

	rcu_read_unlock();

P
Pravin B Shelar 已提交
1948 1949 1950
	return 0;

drop:
J
Jiri Benc 已提交
1951 1952 1953
	/* Consume bad packet */
	kfree_skb(skb);
	return 0;
P
Pravin B Shelar 已提交
1954 1955
}

S
Stefano Brivio 已提交
1956 1957 1958 1959 1960 1961 1962 1963
/* Callback from net/ipv{4,6}/udp.c to check that we have a VNI for errors */
static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan;
	struct vxlan_sock *vs;
	struct vxlanhdr *hdr;
	__be32 vni;

1964
	if (!pskb_may_pull(skb, skb_transport_offset(skb) + VXLAN_HLEN))
S
Stefano Brivio 已提交
1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983
		return -EINVAL;

	hdr = vxlan_hdr(skb);

	if (!(hdr->vx_flags & VXLAN_HF_VNI))
		return -EINVAL;

	vs = rcu_dereference_sk_user_data(sk);
	if (!vs)
		return -ENOENT;

	vni = vxlan_vni(hdr->vx_vni);
	vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
	if (!vxlan)
		return -ENOENT;

	return 0;
}

1984
/* Try to answer an ARP request from the neighbour table instead of sending
 * it through the tunnel.
 *
 * A reply is built and injected with netif_rx_ni() when the target is a
 * connected neighbour that is not a bridge-local FDB entry.  When the target
 * is unknown and VXLAN_F_L3MISS is set, an L3 miss is reported instead.
 *
 * The request is always consumed; the return value is always NETDEV_TX_OK.
 */
static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct arphdr *parp;
	u8 *arpptr, *sha;
	__be32 sip, tip;
	struct neighbour *n;

	if (dev->flags & IFF_NOARP)
		goto out;

	/* arp_hdr() starts at the network header while pskb_may_pull() counts
	 * from skb->data, so the bytes in front of the network header have to
	 * be included in the length that must be linear.
	 */
	if (!pskb_may_pull(skb, skb_network_offset(skb) + arp_hdr_len(dev))) {
		dev->stats.tx_dropped++;
		goto out;
	}
	parp = arp_hdr(skb);

	if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
	     parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
	    parp->ar_pro != htons(ETH_P_IP) ||
	    parp->ar_op != htons(ARPOP_REQUEST) ||
	    parp->ar_hln != dev->addr_len ||
	    parp->ar_pln != 4)
		goto out;

	/* Walk the variable part of the ARP payload: sha, sip, tha, tip */
	arpptr = (u8 *)parp + sizeof(struct arphdr);
	sha = arpptr;
	arpptr += dev->addr_len;	/* sha */
	memcpy(&sip, arpptr, sizeof(sip));
	arpptr += sizeof(sip);
	arpptr += dev->addr_len;	/* tha */
	memcpy(&tip, arpptr, sizeof(tip));

	if (ipv4_is_loopback(tip) ||
	    ipv4_is_multicast(tip))
		goto out;

	n = neigh_lookup(&arp_tbl, &tip, dev);

	if (n) {
		struct vxlan_fdb *f;
		struct sk_buff	*reply;

		if (!(n->nud_state & NUD_CONNECTED)) {
			neigh_release(n);
			goto out;
		}

		f = vxlan_find_mac(vxlan, n->ha, vni);
		/* Only look at the first remote when the entry is not backed
		 * by a nexthop object: such entries do not appear to use the
		 * remotes list (see the f->nh test in vxlan_xmit()).
		 */
		if (f && !rcu_access_pointer(f->nh) &&
		    vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
			/* bridge-local neighbor */
			neigh_release(n);
			goto out;
		}

		reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
				n->ha, sha);

		neigh_release(n);

		if (reply == NULL)
			goto out;

		skb_reset_mac_header(reply);
		__skb_pull(reply, skb_network_offset(reply));
		reply->ip_summed = CHECKSUM_UNNECESSARY;
		reply->pkt_type = PACKET_HOST;

		if (netif_rx_ni(reply) == NET_RX_DROP)
			dev->stats.rx_dropped++;
	} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
		union vxlan_addr ipa = {
			.sin.sin_addr.s_addr = tip,
			.sin.sin_family = AF_INET,
		};

		vxlan_ip_miss(dev, &ipa);
	}
out:
	consume_skb(skb);
	return NETDEV_TX_OK;
}

C
Cong Wang 已提交
2066
#if IS_ENABLED(CONFIG_IPV6)
2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078
/* Build a Neighbour Advertisement answering the Neighbour Solicitation in
 * @request on behalf of neighbour @n.
 *
 * @request:  the solicitation; it is fully linearised here
 * @n:        neighbour whose link-layer address (n->ha) is advertised
 * @isrouter: value stored in the advertisement's router flag
 *
 * The reply is addressed to the source link-layer address option of the
 * request when one is present, otherwise to the request's Ethernet source.
 *
 * Returns the new skb, or NULL when the request has no device, cannot be
 * linearised, carries a zero-length or truncated option, or when the
 * allocation fails.
 */
static struct sk_buff *vxlan_na_create(struct sk_buff *request,
	struct neighbour *n, bool isrouter)
{
	struct net_device *dev = request->dev;
	struct sk_buff *reply;
	struct nd_msg *ns, *na;
	struct ipv6hdr *pip6;
	u8 *daddr;
	int na_olen = 8; /* opt hdr + ETH_ALEN for target */
	int ns_olen;
	int opt_len;
	int i, len;

	if (dev == NULL || !pskb_may_pull(request, request->len))
		return NULL;

	len = LL_RESERVED_SPACE(dev) + sizeof(struct ipv6hdr) +
		sizeof(*na) + na_olen + dev->needed_tailroom;
	reply = alloc_skb(len, GFP_ATOMIC);
	if (reply == NULL)
		return NULL;

	reply->protocol = htons(ETH_P_IPV6);
	reply->dev = dev;
	skb_reserve(reply, LL_RESERVED_SPACE(request->dev));
	skb_push(reply, sizeof(struct ethhdr));
	skb_reset_mac_header(reply);

	ns = (struct nd_msg *)(ipv6_hdr(request) + 1);

	daddr = eth_hdr(request)->h_source;
	ns_olen = request->len - skb_network_offset(request) -
		sizeof(struct ipv6hdr) - sizeof(*ns);
	for (i = 0; i < ns_olen - 1; i += opt_len) {
		/* Option length is expressed in units of 8 octets */
		opt_len = ns->opt[i + 1] << 3;

		/* A zero length would loop forever and a length reaching past
		 * the end of the options would let the address copy below read
		 * outside the packet: treat both as a malformed request.
		 */
		if (!opt_len || i + opt_len > ns_olen) {
			kfree_skb(reply);
			return NULL;
		}
		if (ns->opt[i] == ND_OPT_SOURCE_LL_ADDR) {
			daddr = ns->opt + i + sizeof(struct nd_opt_hdr);
			break;
		}
	}

	/* Ethernet header */
	ether_addr_copy(eth_hdr(reply)->h_dest, daddr);
	ether_addr_copy(eth_hdr(reply)->h_source, n->ha);
	eth_hdr(reply)->h_proto = htons(ETH_P_IPV6);

	skb_pull(reply, sizeof(struct ethhdr));
	skb_reset_network_header(reply);
	skb_put(reply, sizeof(struct ipv6hdr));

	/* IPv6 header */

	pip6 = ipv6_hdr(reply);
	memset(pip6, 0, sizeof(struct ipv6hdr));
	pip6->version = 6;
	pip6->priority = ipv6_hdr(request)->priority;
	pip6->nexthdr = IPPROTO_ICMPV6;
	pip6->hop_limit = 255;
	pip6->daddr = ipv6_hdr(request)->saddr;
	pip6->saddr = *(struct in6_addr *)n->primary_key;

	skb_pull(reply, sizeof(struct ipv6hdr));
	skb_reset_transport_header(reply);

	/* Neighbor Advertisement */
	na = skb_put_zero(reply, sizeof(*na) + na_olen);
	na->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT;
	na->icmph.icmp6_router = isrouter;
	na->icmph.icmp6_override = 1;
	na->icmph.icmp6_solicited = 1;
	na->target = ns->target;
	ether_addr_copy(&na->opt[2], n->ha);
	na->opt[0] = ND_OPT_TARGET_LL_ADDR;
	na->opt[1] = na_olen >> 3;

	na->icmph.icmp6_cksum = csum_ipv6_magic(&pip6->saddr,
		&pip6->daddr, sizeof(*na)+na_olen, IPPROTO_ICMPV6,
		csum_partial(na, sizeof(*na)+na_olen, 0));

	pip6->payload_len = htons(sizeof(*na)+na_olen);

	/* Expose the IPv6 header again so skb->data is the network header */
	skb_push(reply, sizeof(struct ipv6hdr));

	reply->ip_summed = CHECKSUM_UNNECESSARY;

	return reply;
}

2158
/* IPv6 counterpart of arp_reduce(): try to answer a Neighbour Solicitation
 * from the neighbour table instead of sending it through the tunnel.
 *
 * The caller is expected to have made the IPv6 header and the ND message
 * linear.  A Neighbour Advertisement is injected with netif_rx_ni() when the
 * target is a connected neighbour that is not a bridge-local FDB entry; when
 * the target is unknown and VXLAN_F_L3MISS is set, an L3 miss is reported.
 *
 * The request is always consumed; the return value is always NETDEV_TX_OK.
 */
static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct in6_addr *daddr;
	const struct ipv6hdr *iphdr;
	struct inet6_dev *in6_dev;
	struct neighbour *n;
	struct nd_msg *msg;

	rcu_read_lock();
	in6_dev = __in6_dev_get(dev);
	if (!in6_dev)
		goto out;

	iphdr = ipv6_hdr(skb);
	daddr = &iphdr->daddr;
	msg = (struct nd_msg *)(iphdr + 1);

	if (ipv6_addr_loopback(daddr) ||
	    ipv6_addr_is_multicast(&msg->target))
		goto out;

	n = neigh_lookup(ipv6_stub->nd_tbl, &msg->target, dev);

	if (n) {
		struct vxlan_fdb *f;
		struct sk_buff *reply;

		if (!(n->nud_state & NUD_CONNECTED)) {
			neigh_release(n);
			goto out;
		}

		f = vxlan_find_mac(vxlan, n->ha, vni);
		/* Only look at the first remote when the entry is not backed
		 * by a nexthop object: such entries do not appear to use the
		 * remotes list (see the f->nh test in vxlan_xmit()).
		 */
		if (f && !rcu_access_pointer(f->nh) &&
		    vxlan_addr_any(&(first_remote_rcu(f)->remote_ip))) {
			/* bridge-local neighbor */
			neigh_release(n);
			goto out;
		}

		/* Advertise the router flag only for FDB entries marked as such */
		reply = vxlan_na_create(skb, n,
					!!(f ? f->flags & NTF_ROUTER : 0));

		neigh_release(n);

		if (reply == NULL)
			goto out;

		if (netif_rx_ni(reply) == NET_RX_DROP)
			dev->stats.rx_dropped++;

	} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
		union vxlan_addr ipa = {
			.sin6.sin6_addr = msg->target,
			.sin6.sin6_family = AF_INET6,
		};

		vxlan_ip_miss(dev, &ipa);
	}

out:
	rcu_read_unlock();
	consume_skb(skb);
	return NETDEV_TX_OK;
}
#endif

D
David Stevens 已提交
2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235
static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct neighbour *n;

	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
		return false;

	n = NULL;
	switch (ntohs(eth_hdr(skb)->h_proto)) {
	case ETH_P_IP:
2236 2237 2238
	{
		struct iphdr *pip;

D
David Stevens 已提交
2239 2240 2241 2242
		if (!pskb_may_pull(skb, sizeof(struct iphdr)))
			return false;
		pip = ip_hdr(skb);
		n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
2243
		if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
C
Cong Wang 已提交
2244 2245
			union vxlan_addr ipa = {
				.sin.sin_addr.s_addr = pip->daddr,
2246
				.sin.sin_family = AF_INET,
C
Cong Wang 已提交
2247 2248 2249 2250 2251 2252
			};

			vxlan_ip_miss(dev, &ipa);
			return false;
		}

D
David Stevens 已提交
2253
		break;
2254 2255 2256 2257 2258 2259 2260 2261 2262 2263
	}
#if IS_ENABLED(CONFIG_IPV6)
	case ETH_P_IPV6:
	{
		struct ipv6hdr *pip6;

		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
			return false;
		pip6 = ipv6_hdr(skb);
		n = neigh_lookup(ipv6_stub->nd_tbl, &pip6->daddr, dev);
2264
		if (!n && (vxlan->cfg.flags & VXLAN_F_L3MISS)) {
2265 2266
			union vxlan_addr ipa = {
				.sin6.sin6_addr = pip6->daddr,
2267
				.sin6.sin6_family = AF_INET6,
2268 2269 2270 2271 2272 2273 2274 2275 2276
			};

			vxlan_ip_miss(dev, &ipa);
			return false;
		}

		break;
	}
#endif
D
David Stevens 已提交
2277 2278 2279 2280 2281 2282 2283
	default:
		return false;
	}

	if (n) {
		bool diff;

2284
		diff = !ether_addr_equal(eth_hdr(skb)->h_dest, n->ha);
D
David Stevens 已提交
2285 2286 2287 2288 2289 2290 2291
		if (diff) {
			memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
				dev->addr_len);
			memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
		}
		neigh_release(n);
		return diff;
C
Cong Wang 已提交
2292 2293
	}

D
David Stevens 已提交
2294 2295 2296
	return false;
}

2297
/* Fill in the Group Based Policy fields of a VXLAN header from @md->gbp.
 *
 * Nothing is written when @md->gbp is zero.  @vxflags is not used here.
 */
static void vxlan_build_gbp_hdr(struct vxlanhdr *vxh, u32 vxflags,
				struct vxlan_metadata *md)
{
	struct vxlanhdr_gbp *gbp_hdr = (struct vxlanhdr_gbp *)vxh;

	if (md->gbp) {
		vxh->vx_flags |= VXLAN_HF_GBP;

		if (md->gbp & VXLAN_GBP_DONT_LEARN)
			gbp_hdr->dont_learn = 1;

		if (md->gbp & VXLAN_GBP_POLICY_APPLIED)
			gbp_hdr->policy_applied = 1;

		gbp_hdr->policy_id = htons(md->gbp & VXLAN_GBP_ID_MASK);
	}
}

J
Jiri Benc 已提交
2317 2318 2319 2320 2321 2322
/* Fill in the Generic Protocol Extension fields of a VXLAN header.
 *
 * The next-protocol field is derived from the Ethernet protocol @protocol.
 * Returns 0 on success or -EPFNOSUPPORT when the resulting next-protocol
 * value is zero.  @vxflags is not used here.
 */
static int vxlan_build_gpe_hdr(struct vxlanhdr *vxh, u32 vxflags,
			       __be16 protocol)
{
	struct vxlanhdr_gpe *gpe_hdr = (struct vxlanhdr_gpe *)vxh;

	gpe_hdr->np_applied = 1;
	gpe_hdr->next_protocol = tun_p_from_eth_p(protocol);

	return gpe_hdr->next_protocol ? 0 : -EPFNOSUPPORT;
}

2329 2330 2331
static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
			   int iphdr_len, __be32 vni,
			   struct vxlan_metadata *md, u32 vxflags,
2332
			   bool udp_sum)
C
Cong Wang 已提交
2333 2334 2335 2336
{
	struct vxlanhdr *vxh;
	int min_headroom;
	int err;
T
Tom Herbert 已提交
2337
	int type = udp_sum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
J
Jiri Benc 已提交
2338
	__be16 inner_protocol = htons(ETH_P_TEB);
T
Tom Herbert 已提交
2339

2340
	if ((vxflags & VXLAN_F_REMCSUM_TX) &&
T
Tom Herbert 已提交
2341 2342 2343 2344 2345 2346
	    skb->ip_summed == CHECKSUM_PARTIAL) {
		int csum_start = skb_checksum_start_offset(skb);

		if (csum_start <= VXLAN_MAX_REMCSUM_START &&
		    !(csum_start & VXLAN_RCO_SHIFT_MASK) &&
		    (skb->csum_offset == offsetof(struct udphdr, check) ||
2347
		     skb->csum_offset == offsetof(struct tcphdr, check)))
T
Tom Herbert 已提交
2348 2349
			type |= SKB_GSO_TUNNEL_REMCSUM;
	}
C
Cong Wang 已提交
2350 2351

	min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
2352
			+ VXLAN_HLEN + iphdr_len;
2353 2354 2355

	/* Need space for new headers (invalidates iph ptr) */
	err = skb_cow_head(skb, min_headroom);
J
Jiri Benc 已提交
2356
	if (unlikely(err))
P
pravin shelar 已提交
2357
		return err;
2358

2359 2360
	err = iptunnel_handle_offloads(skb, type);
	if (err)
P
pravin shelar 已提交
2361
		return err;
2362

2363
	vxh = __skb_push(skb, sizeof(*vxh));
2364 2365
	vxh->vx_flags = VXLAN_HF_VNI;
	vxh->vx_vni = vxlan_vni_field(vni);
2366

T
Tom Herbert 已提交
2367
	if (type & SKB_GSO_TUNNEL_REMCSUM) {
2368
		unsigned int start;
T
Tom Herbert 已提交
2369

2370 2371 2372
		start = skb_checksum_start_offset(skb) - sizeof(struct vxlanhdr);
		vxh->vx_vni |= vxlan_compute_rco(start, skb->csum_offset);
		vxh->vx_flags |= VXLAN_HF_RCO;
T
Tom Herbert 已提交
2373 2374 2375 2376 2377 2378 2379

		if (!skb_is_gso(skb)) {
			skb->ip_summed = CHECKSUM_NONE;
			skb->encapsulation = 0;
		}
	}

2380 2381
	if (vxflags & VXLAN_F_GBP)
		vxlan_build_gbp_hdr(vxh, vxflags, md);
J
Jiri Benc 已提交
2382 2383 2384
	if (vxflags & VXLAN_F_GPE) {
		err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
		if (err < 0)
P
pravin shelar 已提交
2385
			return err;
J
Jiri Benc 已提交
2386 2387
		inner_protocol = skb->protocol;
	}
T
Thomas Graf 已提交
2388

J
Jiri Benc 已提交
2389
	skb_set_inner_protocol(skb, inner_protocol);
2390
	return 0;
2391 2392
}

2393 2394
/* Find the IPv4 route used to reach @daddr for a VXLAN transmission.
 *
 * @saddr is both input (preferred source) and output (source selected by
 * the cache hit or the route lookup).  The dst cache is used only when
 * ip_tunnel_dst_cache_usable() allows it and not for a non-zero @tos without
 * tunnel @info.
 *
 * Returns the route, or ERR_PTR(-EIO) without an IPv4 socket,
 * ERR_PTR(-ENETUNREACH) when the lookup fails and ERR_PTR(-ELOOP) when the
 * route points back at @dev.
 */
static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct net_device *dev,
				      struct vxlan_sock *sock4,
				      struct sk_buff *skb, int oif, u8 tos,
				      __be32 daddr, __be32 *saddr, __be16 dport, __be16 sport,
				      struct dst_cache *dst_cache,
				      const struct ip_tunnel_info *info)
{
	bool cacheable = ip_tunnel_dst_cache_usable(skb, info);
	struct rtable *route = NULL;
	struct flowi4 key;

	if (!sock4)
		return ERR_PTR(-EIO);

	if (!info && tos)
		cacheable = false;

	if (cacheable) {
		route = dst_cache_get_ip4(dst_cache, saddr);
		if (route)
			return route;
	}

	memset(&key, 0, sizeof(key));
	key.daddr = daddr;
	key.saddr = *saddr;
	key.fl4_dport = dport;
	key.fl4_sport = sport;
	key.flowi4_proto = IPPROTO_UDP;
	key.flowi4_oif = oif;
	key.flowi4_tos = RT_TOS(tos);
	key.flowi4_mark = skb->mark;

	route = ip_route_output_key(vxlan->net, &key);
	if (IS_ERR(route)) {
		netdev_dbg(dev, "no route to %pI4\n", &daddr);
		return ERR_PTR(-ENETUNREACH);
	}

	if (route->dst.dev == dev) {
		netdev_dbg(dev, "circular route to %pI4\n", &daddr);
		ip_rt_put(route);
		return ERR_PTR(-ELOOP);
	}

	/* Report the source address picked by the lookup */
	*saddr = key.saddr;
	if (cacheable)
		dst_cache_set_ip4(dst_cache, &route->dst, key.saddr);

	return route;
}

2443 2444
#if IS_ENABLED(CONFIG_IPV6)
static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
2445
					  struct net_device *dev,
2446
					  struct vxlan_sock *sock6,
2447
					  struct sk_buff *skb, int oif, u8 tos,
2448
					  __be32 label,
2449
					  const struct in6_addr *daddr,
2450
					  struct in6_addr *saddr,
2451
					  __be16 dport, __be16 sport,
2452 2453
					  struct dst_cache *dst_cache,
					  const struct ip_tunnel_info *info)
2454
{
2455
	bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
2456 2457 2458
	struct dst_entry *ndst;
	struct flowi6 fl6;

2459 2460 2461
	if (!sock6)
		return ERR_PTR(-EIO);

2462 2463
	if (tos && !info)
		use_cache = false;
2464
	if (use_cache) {
2465 2466 2467 2468 2469
		ndst = dst_cache_get_ip6(dst_cache, saddr);
		if (ndst)
			return ndst;
	}

2470 2471 2472
	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.daddr = *daddr;
2473
	fl6.saddr = *saddr;
2474
	fl6.flowlabel = ip6_make_flowinfo(RT_TOS(tos), label);
2475 2476
	fl6.flowi6_mark = skb->mark;
	fl6.flowi6_proto = IPPROTO_UDP;
2477 2478
	fl6.fl6_dport = dport;
	fl6.fl6_sport = sport;
2479

2480 2481
	ndst = ipv6_stub->ipv6_dst_lookup_flow(vxlan->net, sock6->sock->sk,
					       &fl6, NULL);
2482
	if (IS_ERR(ndst)) {
2483 2484 2485 2486 2487 2488 2489 2490 2491
		netdev_dbg(dev, "no route to %pI6\n", daddr);
		return ERR_PTR(-ENETUNREACH);
	}

	if (unlikely(ndst->dev == dev)) {
		netdev_dbg(dev, "circular route to %pI6\n", daddr);
		dst_release(ndst);
		return ERR_PTR(-ELOOP);
	}
2492 2493

	*saddr = fl6.saddr;
2494
	if (use_cache)
2495
		dst_cache_set_ip6(dst_cache, ndst, saddr);
2496 2497 2498 2499
	return ndst;
}
#endif

2500 2501
/* Bypass encapsulation if the destination is local.
 *
 * The frame is handed directly to @dst_vxlan's device with netif_rx().  TX
 * statistics are accounted on @src_vxlan and RX statistics on @dst_vxlan.
 * With @snoop set and learning enabled on @dst_vxlan, the source MAC is
 * learned against a loopback address of the destination's address family.
 *
 * The skb is always consumed.
 */
static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
			       struct vxlan_dev *dst_vxlan, __be32 vni,
			       bool snoop)
{
	struct pcpu_sw_netstats *sender = this_cpu_ptr(src_vxlan->dev->tstats);
	struct pcpu_sw_netstats *receiver = this_cpu_ptr(dst_vxlan->dev->tstats);
	/* Length is sampled before the pull below shortens the skb */
	const int orig_len = skb->len;
	struct net_device *rx_dev;
	union vxlan_addr loopback;

	skb->pkt_type = PACKET_HOST;
	skb->encapsulation = 0;
	skb->dev = dst_vxlan->dev;
	__skb_pull(skb, skb_network_offset(skb));

	if (dst_vxlan->default_dst.remote_ip.sa.sa_family == AF_INET) {
		loopback.sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		loopback.sa.sa_family =  AF_INET;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		loopback.sin6.sin6_addr = in6addr_loopback;
		loopback.sa.sa_family =  AF_INET6;
#endif
	}

	rcu_read_lock();
	rx_dev = skb->dev;

	if (unlikely(!(rx_dev->flags & IFF_UP))) {
		/* Receiving device is down: free the frame and count a drop */
		kfree_skb(skb);
		rx_dev->stats.rx_dropped++;
		goto unlock;
	}

	if (snoop && (dst_vxlan->cfg.flags & VXLAN_F_LEARN))
		vxlan_snoop(rx_dev, &loopback, eth_hdr(skb)->h_source, 0, vni);

	u64_stats_update_begin(&sender->syncp);
	sender->tx_packets++;
	sender->tx_bytes += orig_len;
	u64_stats_update_end(&sender->syncp);

	if (netif_rx(skb) != NET_RX_SUCCESS) {
		rx_dev->stats.rx_dropped++;
		goto unlock;
	}

	u64_stats_update_begin(&receiver->syncp);
	receiver->rx_packets++;
	receiver->rx_bytes += orig_len;
	u64_stats_update_end(&receiver->syncp);

unlock:
	rcu_read_unlock();
}

2555
static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
2556 2557 2558 2559
				 struct vxlan_dev *vxlan,
				 union vxlan_addr *daddr,
				 __be16 dst_port, int dst_ifindex, __be32 vni,
				 struct dst_entry *dst,
2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574
				 u32 rt_flags)
{
#if IS_ENABLED(CONFIG_IPV6)
	/* IPv6 rt-flags are checked against RTF_LOCAL, but the value of
	 * RTF_LOCAL is equal to RTCF_LOCAL. So to keep code simple
	 * we can use RTCF_LOCAL which works for ipv4 and ipv6 route entry.
	 */
	BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL);
#endif
	/* Bypass encapsulation if the destination is local */
	if (rt_flags & RTCF_LOCAL &&
	    !(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
		struct vxlan_dev *dst_vxlan;

		dst_release(dst);
2575
		dst_vxlan = vxlan_find_vni(vxlan->net, dst_ifindex, vni,
2576
					   daddr->sa.sa_family, dst_port,
2577
					   vxlan->cfg.flags);
2578 2579 2580 2581 2582 2583
		if (!dst_vxlan) {
			dev->stats.tx_errors++;
			kfree_skb(skb);

			return -ENOENT;
		}
2584
		vxlan_encap_bypass(skb, vxlan, dst_vxlan, vni, true);
2585 2586 2587 2588 2589 2590
		return 1;
	}

	return 0;
}

2591
static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
2592 2593
			   __be32 default_vni, struct vxlan_rdst *rdst,
			   bool did_rsc)
S
stephen hemminger 已提交
2594
{
2595
	struct dst_cache *dst_cache;
2596
	struct ip_tunnel_info *info;
S
stephen hemminger 已提交
2597
	struct vxlan_dev *vxlan = netdev_priv(dev);
P
pravin shelar 已提交
2598
	const struct iphdr *old_iph = ip_hdr(skb);
C
Cong Wang 已提交
2599
	union vxlan_addr *dst;
2600
	union vxlan_addr remote_ip, local_ip;
T
Thomas Graf 已提交
2601 2602
	struct vxlan_metadata _md;
	struct vxlan_metadata *md = &_md;
C
Cong Wang 已提交
2603
	__be16 src_port = 0, dst_port;
2604
	struct dst_entry *ndst = NULL;
2605
	__be32 vni, label;
S
stephen hemminger 已提交
2606
	__u8 tos, ttl;
2607
	int ifindex;
2608
	int err;
2609
	u32 flags = vxlan->cfg.flags;
2610
	bool udp_sum = false;
2611
	bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));
S
stephen hemminger 已提交
2612

2613
	info = skb_tunnel_info(skb);
2614

T
Thomas Graf 已提交
2615
	if (rdst) {
P
pravin shelar 已提交
2616 2617 2618 2619
		dst = &rdst->remote_ip;
		if (vxlan_addr_any(dst)) {
			if (did_rsc) {
				/* short-circuited back to local bridge */
2620 2621
				vxlan_encap_bypass(skb, vxlan, vxlan,
						   default_vni, true);
P
pravin shelar 已提交
2622 2623 2624 2625 2626
				return;
			}
			goto drop;
		}

2627
		dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
2628
		vni = (rdst->remote_vni) ? : default_vni;
2629
		ifindex = rdst->remote_ifindex;
2630
		local_ip = vxlan->cfg.saddr;
2631
		dst_cache = &rdst->dst_cache;
P
pravin shelar 已提交
2632
		md->gbp = skb->mark;
H
Hangbin Liu 已提交
2633 2634 2635 2636 2637 2638 2639
		if (flags & VXLAN_F_TTL_INHERIT) {
			ttl = ip_tunnel_get_ttl(old_iph, skb);
		} else {
			ttl = vxlan->cfg.ttl;
			if (!ttl && vxlan_addr_multicast(dst))
				ttl = 1;
		}
P
pravin shelar 已提交
2640 2641 2642 2643 2644 2645 2646 2647 2648 2649

		tos = vxlan->cfg.tos;
		if (tos == 1)
			tos = ip_tunnel_get_dsfield(old_iph, skb);

		if (dst->sa.sa_family == AF_INET)
			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
		else
			udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
		label = vxlan->cfg.label;
T
Thomas Graf 已提交
2650
	} else {
2651 2652 2653 2654 2655
		if (!info) {
			WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
				  dev->name);
			goto drop;
		}
2656
		remote_ip.sa.sa_family = ip_tunnel_info_af(info);
2657
		if (remote_ip.sa.sa_family == AF_INET) {
2658
			remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
2659 2660
			local_ip.sin.sin_addr.s_addr = info->key.u.ipv4.src;
		} else {
2661
			remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;
2662 2663
			local_ip.sin6.sin6_addr = info->key.u.ipv6.src;
		}
T
Thomas Graf 已提交
2664
		dst = &remote_ip;
P
pravin shelar 已提交
2665 2666
		dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
		vni = tunnel_id_to_key32(info->key.tun_id);
2667
		ifindex = 0;
2668
		dst_cache = &info->dst_cache;
2669 2670 2671
		if (info->key.tun_flags & TUNNEL_VXLAN_OPT) {
			if (info->options_len < sizeof(*md))
				goto drop;
P
pravin shelar 已提交
2672
			md = ip_tunnel_info_opts(info);
2673
		}
2674 2675
		ttl = info->key.ttl;
		tos = info->key.tos;
2676
		label = info->key.label;
2677
		udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
2678
	}
P
pravin shelar 已提交
2679 2680
	src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
				     vxlan->cfg.port_max, true);
2681

J
Jakub Kicinski 已提交
2682
	rcu_read_lock();
C
Cong Wang 已提交
2683
	if (dst->sa.sa_family == AF_INET) {
2684
		struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
P
pravin shelar 已提交
2685
		struct rtable *rt;
P
pravin shelar 已提交
2686
		__be16 df = 0;
2687

2688 2689 2690
		if (!ifindex)
			ifindex = sock4->sock->sk->sk_bound_dev_if;

2691
		rt = vxlan_get_route(vxlan, dev, sock4, skb, ifindex, tos,
2692
				     dst->sin.sin_addr.s_addr,
2693
				     &local_ip.sin.sin_addr.s_addr,
2694
				     dst_port, src_port,
2695
				     dst_cache, info);
2696 2697
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
P
pravin shelar 已提交
2698
			goto tx_error;
2699
		}
C
Cong Wang 已提交
2700

2701
		if (!info) {
2702
			/* Bypass encapsulation if the destination is local */
2703
			err = encap_bypass_if_local(skb, dev, vxlan, dst,
2704 2705
						    dst_port, ifindex, vni,
						    &rt->dst, rt->rt_flags);
2706
			if (err)
J
Jakub Kicinski 已提交
2707
				goto out_unlock;
2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718

			if (vxlan->cfg.df == VXLAN_DF_SET) {
				df = htons(IP_DF);
			} else if (vxlan->cfg.df == VXLAN_DF_INHERIT) {
				struct ethhdr *eth = eth_hdr(skb);

				if (ntohs(eth->h_proto) == ETH_P_IPV6 ||
				    (ntohs(eth->h_proto) == ETH_P_IP &&
				     old_iph->frag_off & htons(IP_DF)))
					df = htons(IP_DF);
			}
2719
		} else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) {
2720
			df = htons(IP_DF);
2721
		}
2722

P
pravin shelar 已提交
2723
		ndst = &rt->dst;
2724 2725 2726 2727 2728 2729
		err = skb_tunnel_check_pmtu(skb, ndst, VXLAN_HEADROOM,
					    netif_is_any_bridge_port(dev));
		if (err < 0) {
			goto tx_error;
		} else if (err) {
			if (info) {
2730
				struct ip_tunnel_info *unclone;
2731 2732
				struct in_addr src, dst;

2733 2734 2735 2736
				unclone = skb_tunnel_info_unclone(skb);
				if (unlikely(!unclone))
					goto tx_error;

2737 2738
				src = remote_ip.sin.sin_addr;
				dst = local_ip.sin.sin_addr;
2739 2740
				unclone->key.u.ipv4.src = src.s_addr;
				unclone->key.u.ipv4.dst = dst.s_addr;
2741 2742 2743 2744 2745
			}
			vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
			dst_release(ndst);
			goto out_unlock;
		}
X
Xin Long 已提交
2746

2747
		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
C
Cong Wang 已提交
2748
		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
P
pravin shelar 已提交
2749
		err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
2750
				      vni, md, flags, udp_sum);
2751
		if (err < 0)
P
pravin shelar 已提交
2752
			goto tx_error;
2753

2754
		udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, local_ip.sin.sin_addr.s_addr,
2755 2756
				    dst->sin.sin_addr.s_addr, tos, ttl, df,
				    src_port, dst_port, xnet, !udp_sum);
C
Cong Wang 已提交
2757 2758
#if IS_ENABLED(CONFIG_IPV6)
	} else {
2759
		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
C
Cong Wang 已提交
2760

2761 2762 2763
		if (!ifindex)
			ifindex = sock6->sock->sk->sk_bound_dev_if;

2764
		ndst = vxlan6_get_route(vxlan, dev, sock6, skb, ifindex, tos,
2765
					label, &dst->sin6.sin6_addr,
2766
					&local_ip.sin6.sin6_addr,
2767
					dst_port, src_port,
2768
					dst_cache, info);
2769
		if (IS_ERR(ndst)) {
2770
			err = PTR_ERR(ndst);
P
pravin shelar 已提交
2771
			ndst = NULL;
2772
			goto tx_error;
C
Cong Wang 已提交
2773
		}
2774

2775 2776
		if (!info) {
			u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
2777

2778
			err = encap_bypass_if_local(skb, dev, vxlan, dst,
2779 2780
						    dst_port, ifindex, vni,
						    ndst, rt6i_flags);
2781
			if (err)
J
Jakub Kicinski 已提交
2782
				goto out_unlock;
2783
		}
2784

2785 2786 2787 2788 2789 2790
		err = skb_tunnel_check_pmtu(skb, ndst, VXLAN6_HEADROOM,
					    netif_is_any_bridge_port(dev));
		if (err < 0) {
			goto tx_error;
		} else if (err) {
			if (info) {
2791
				struct ip_tunnel_info *unclone;
2792 2793
				struct in6_addr src, dst;

2794 2795 2796 2797
				unclone = skb_tunnel_info_unclone(skb);
				if (unlikely(!unclone))
					goto tx_error;

2798 2799
				src = remote_ip.sin6.sin6_addr;
				dst = local_ip.sin6.sin6_addr;
2800 2801
				unclone->key.u.ipv6.src = src;
				unclone->key.u.ipv6.dst = dst;
2802 2803 2804 2805 2806 2807
			}

			vxlan_encap_bypass(skb, vxlan, vxlan, vni, false);
			dst_release(ndst);
			goto out_unlock;
		}
X
Xin Long 已提交
2808

2809
		tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
C
Cong Wang 已提交
2810
		ttl = ttl ? : ip6_dst_hoplimit(ndst);
2811 2812
		skb_scrub_packet(skb, xnet);
		err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr),
2813
				      vni, md, flags, udp_sum);
P
pravin shelar 已提交
2814 2815 2816
		if (err < 0)
			goto tx_error;

P
pravin shelar 已提交
2817
		udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
2818
				     &local_ip.sin6.sin6_addr,
2819
				     &dst->sin6.sin6_addr, tos, ttl,
2820
				     label, src_port, dst_port, !udp_sum);
C
Cong Wang 已提交
2821 2822
#endif
	}
J
Jakub Kicinski 已提交
2823 2824
out_unlock:
	rcu_read_unlock();
2825
	return;
S
stephen hemminger 已提交
2826 2827 2828

drop:
	dev->stats.tx_dropped++;
P
pravin shelar 已提交
2829 2830
	dev_kfree_skb(skb);
	return;
S
stephen hemminger 已提交
2831 2832

tx_error:
J
Jakub Kicinski 已提交
2833
	rcu_read_unlock();
2834 2835 2836 2837
	if (err == -ELOOP)
		dev->stats.collisions++;
	else if (err == -ENETUNREACH)
		dev->stats.tx_carrier_errors++;
P
pravin shelar 已提交
2838
	dst_release(ndst);
S
stephen hemminger 已提交
2839
	dev->stats.tx_errors++;
P
pravin shelar 已提交
2840
	kfree_skb(skb);
S
stephen hemminger 已提交
2841 2842
}

2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874
/* Transmit @skb through an FDB entry that is backed by a nexthop object.
 *
 * A path is selected from the nexthop using the skb flow hash and the frame
 * is sent to it with vxlan_xmit_one().  When the entry has no nexthop or no
 * path can be selected, the frame is dropped and tx_dropped is incremented.
 */
static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev,
			  struct vxlan_fdb *f, __be32 vni, bool did_rsc)
{
	struct vxlan_rdst nh_rdst;
	bool do_xmit = false;
	struct nexthop *nh;
	u32 hash;

	memset(&nh_rdst, 0, sizeof(struct vxlan_rdst));
	hash = skb_get_hash(skb);

	/* The nexthop is only looked at inside the RCU section; the selected
	 * path is copied into nh_rdst for use afterwards.
	 */
	rcu_read_lock();
	nh = rcu_dereference(f->nh);
	if (nh)
		do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst);
	rcu_read_unlock();

	if (unlikely(!do_xmit)) {
		dev->stats.tx_dropped++;
		dev_kfree_skb(skb);
		return;
	}

	vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc);
}

2875 2876 2877 2878 2879 2880 2881 2882 2883
/* Transmit local packets over Vxlan
 *
 * Outer IP header inherits ECN and DF from inner header.
 * Outer UDP destination is the VXLAN assigned port.
 *           source port is based on hash of flow
 */
static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
2884
	struct vxlan_rdst *rdst, *fdst = NULL;
2885
	const struct ip_tunnel_info *info;
2886 2887
	bool did_rsc = false;
	struct vxlan_fdb *f;
2888
	struct ethhdr *eth;
2889
	__be32 vni = 0;
2890

2891
	info = skb_tunnel_info(skb);
2892

2893 2894
	skb_reset_mac_header(skb);

2895
	if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) {
2896 2897 2898 2899 2900 2901 2902 2903 2904 2905
		if (info && info->mode & IP_TUNNEL_INFO_BRIDGE &&
		    info->mode & IP_TUNNEL_INFO_TX) {
			vni = tunnel_id_to_key32(info->key.tun_id);
		} else {
			if (info && info->mode & IP_TUNNEL_INFO_TX)
				vxlan_xmit_one(skb, dev, vni, NULL, false);
			else
				kfree_skb(skb);
			return NETDEV_TX_OK;
		}
2906 2907
	}

2908
	if (vxlan->cfg.flags & VXLAN_F_PROXY) {
2909
		eth = eth_hdr(skb);
C
Cong Wang 已提交
2910
		if (ntohs(eth->h_proto) == ETH_P_ARP)
2911
			return arp_reduce(dev, skb, vni);
C
Cong Wang 已提交
2912
#if IS_ENABLED(CONFIG_IPV6)
2913 2914 2915 2916 2917 2918 2919 2920
		else if (ntohs(eth->h_proto) == ETH_P_IPV6 &&
			 pskb_may_pull(skb, sizeof(struct ipv6hdr) +
					    sizeof(struct nd_msg)) &&
			 ipv6_hdr(skb)->nexthdr == IPPROTO_ICMPV6) {
			struct nd_msg *m = (struct nd_msg *)(ipv6_hdr(skb) + 1);

			if (m->icmph.icmp6_code == 0 &&
			    m->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION)
2921
				return neigh_reduce(dev, skb, vni);
C
Cong Wang 已提交
2922 2923 2924
		}
#endif
	}
2925

2926
	eth = eth_hdr(skb);
2927
	f = vxlan_find_mac(vxlan, eth->h_dest, vni);
2928 2929
	did_rsc = false;

2930
	if (f && (f->flags & NTF_ROUTER) && (vxlan->cfg.flags & VXLAN_F_RSC) &&
2931 2932
	    (ntohs(eth->h_proto) == ETH_P_IP ||
	     ntohs(eth->h_proto) == ETH_P_IPV6)) {
2933 2934
		did_rsc = route_shortcircuit(dev, skb);
		if (did_rsc)
2935
			f = vxlan_find_mac(vxlan, eth->h_dest, vni);
2936 2937
	}

2938
	if (f == NULL) {
2939
		f = vxlan_find_mac(vxlan, all_zeros_mac, vni);
2940
		if (f == NULL) {
2941
			if ((vxlan->cfg.flags & VXLAN_F_L2MISS) &&
2942 2943 2944 2945
			    !is_multicast_ether_addr(eth->h_dest))
				vxlan_fdb_miss(vxlan, eth->h_dest);

			dev->stats.tx_dropped++;
2946
			kfree_skb(skb);
2947 2948 2949
			return NETDEV_TX_OK;
		}
	}
2950

2951 2952 2953 2954 2955 2956
	if (rcu_access_pointer(f->nh)) {
		vxlan_xmit_nh(skb, dev, f,
			      (vni ? : vxlan->default_dst.remote_vni), did_rsc);
	} else {
		list_for_each_entry_rcu(rdst, &f->remotes, list) {
			struct sk_buff *skb1;
2957

2958 2959 2960 2961 2962 2963 2964
			if (!fdst) {
				fdst = rdst;
				continue;
			}
			skb1 = skb_clone(skb, GFP_ATOMIC);
			if (skb1)
				vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc);
2965
		}
2966 2967 2968 2969
		if (fdst)
			vxlan_xmit_one(skb, dev, vni, fdst, did_rsc);
		else
			kfree_skb(skb);
2970 2971
	}

2972
	return NETDEV_TX_OK;
2973 2974
}

S
stephen hemminger 已提交
2975
/* Walk the forwarding table and purge stale entries */
2976
static void vxlan_cleanup(struct timer_list *t)
S
stephen hemminger 已提交
2977
{
2978
	struct vxlan_dev *vxlan = from_timer(vxlan, t, age_timer);
S
stephen hemminger 已提交
2979 2980 2981 2982 2983 2984 2985 2986
	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
	unsigned int h;

	if (!netif_running(vxlan->dev))
		return;

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;
2987

2988
		spin_lock(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
2989 2990 2991 2992 2993
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
			unsigned long timeout;

2994
			if (f->state & (NUD_PERMANENT | NUD_NOARP))
S
stephen hemminger 已提交
2995 2996
				continue;

2997 2998 2999
			if (f->flags & NTF_EXT_LEARNED)
				continue;

3000
			timeout = f->used + vxlan->cfg.age_interval * HZ;
S
stephen hemminger 已提交
3001 3002 3003 3004 3005
			if (time_before_eq(timeout, jiffies)) {
				netdev_dbg(vxlan->dev,
					   "garbage collect %pM\n",
					   f->eth_addr);
				f->state = NUD_STALE;
3006
				vxlan_fdb_destroy(vxlan, f, true, true);
S
stephen hemminger 已提交
3007 3008 3009
			} else if (time_before(timeout, next_timer))
				next_timer = timeout;
		}
3010
		spin_unlock(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
3011 3012 3013 3014 3015
	}

	mod_timer(&vxlan->age_timer, next_timer);
}

3016 3017 3018 3019 3020
/* Unhash the device's IPv4 node (and, with CONFIG_IPV6, its IPv6 node)
 * while holding the per-netns sock_lock.
 */
static void vxlan_vs_del_dev(struct vxlan_dev *vxlan)
{
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);

	spin_lock(&vn->sock_lock);
	hlist_del_init_rcu(&vxlan->hlist4.hlist);
#if IS_ENABLED(CONFIG_IPV6)
	hlist_del_init_rcu(&vxlan->hlist6.hlist);
#endif
	spin_unlock(&vn->sock_lock);
}

J
Jiri Benc 已提交
3028 3029
static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan,
			     struct vxlan_dev_node *node)
3030
{
3031
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
3032
	__be32 vni = vxlan->default_dst.remote_vni;
3033

J
Jiri Benc 已提交
3034
	node->vxlan = vxlan;
3035
	spin_lock(&vn->sock_lock);
J
Jiri Benc 已提交
3036
	hlist_add_head_rcu(&node->hlist, vni_head(vs, vni));
3037
	spin_unlock(&vn->sock_lock);
3038 3039
}

S
stephen hemminger 已提交
3040 3041 3042
/* Setup stats when device is created */
static int vxlan_init(struct net_device *dev)
{
3043 3044 3045
	struct vxlan_dev *vxlan = netdev_priv(dev);
	int err;

3046
	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
3047
	if (!dev->tstats)
S
stephen hemminger 已提交
3048 3049
		return -ENOMEM;

3050 3051 3052 3053 3054 3055
	err = gro_cells_init(&vxlan->gro_cells, dev);
	if (err) {
		free_percpu(dev->tstats);
		return err;
	}

S
stephen hemminger 已提交
3056 3057 3058
	return 0;
}

3059
/* Look up the all-zeros-MAC FDB entry for @vni and destroy it if it
 * exists. The lookup and destroy run under the hash_lock of the bucket
 * that fdb_head_index() selects for that key.
 */
static void vxlan_fdb_delete_default(struct vxlan_dev *vxlan, __be32 vni)
{
	struct vxlan_fdb *f;
	u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, vni);

	spin_lock_bh(&vxlan->hash_lock[hash_index]);
	f = __vxlan_find_mac(vxlan, all_zeros_mac, vni);
	if (f)
		vxlan_fdb_destroy(vxlan, f, true, true);
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
}

3071 3072 3073 3074
static void vxlan_uninit(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

3075 3076
	gro_cells_destroy(&vxlan->gro_cells);

3077
	vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni);
3078

3079 3080 3081
	free_percpu(dev->tstats);
}

S
stephen hemminger 已提交
3082 3083 3084 3085
/* Start ageing timer and join group when device is brought up */
static int vxlan_open(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
3086
	int ret;
3087

3088 3089 3090
	ret = vxlan_sock_add(vxlan);
	if (ret < 0)
		return ret;
S
stephen hemminger 已提交
3091

3092
	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
3093
		ret = vxlan_igmp_join(vxlan);
3094 3095
		if (ret == -EADDRINUSE)
			ret = 0;
3096
		if (ret) {
3097
			vxlan_sock_release(vxlan);
3098 3099
			return ret;
		}
S
stephen hemminger 已提交
3100 3101
	}

3102
	if (vxlan->cfg.age_interval)
S
stephen hemminger 已提交
3103 3104
		mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);

3105
	return ret;
S
stephen hemminger 已提交
3106 3107 3108
}

/* Purge the forwarding table */
3109
static void vxlan_flush(struct vxlan_dev *vxlan, bool do_all)
S
stephen hemminger 已提交
3110
{
3111
	unsigned int h;
S
stephen hemminger 已提交
3112 3113 3114

	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;
3115 3116

		spin_lock_bh(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
3117 3118 3119
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
3120 3121
			if (!do_all && (f->state & (NUD_PERMANENT | NUD_NOARP)))
				continue;
3122
			/* the all_zeros_mac entry is deleted at vxlan_uninit */
T
Taehee Yoo 已提交
3123 3124 3125 3126
			if (is_zero_ether_addr(f->eth_addr) &&
			    f->vni == vxlan->cfg.vni)
				continue;
			vxlan_fdb_destroy(vxlan, f, true, true);
S
stephen hemminger 已提交
3127
		}
3128
		spin_unlock_bh(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
3129 3130 3131 3132 3133 3134 3135
	}
}

/* Cleanup timer and forwarding table on shutdown */
static int vxlan_stop(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
N
Nicolas Dichtel 已提交
3136
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
3137
	int ret = 0;
S
stephen hemminger 已提交
3138

3139
	if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
3140
	    !vxlan_group_used(vn, vxlan))
3141
		ret = vxlan_igmp_leave(vxlan);
S
stephen hemminger 已提交
3142 3143 3144

	del_timer_sync(&vxlan->age_timer);

3145
	vxlan_flush(vxlan, false);
3146
	vxlan_sock_release(vxlan);
S
stephen hemminger 已提交
3147

3148
	return ret;
S
stephen hemminger 已提交
3149 3150 3151 3152 3153 3154 3155
}

/* Stub, nothing needs to be done.
 * Installed as .ndo_set_rx_mode in vxlan_netdev_ether_ops.
 */
static void vxlan_set_multicast_list(struct net_device *dev)
{
}

3156
static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
3157
{
3158 3159 3160 3161
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	struct net_device *lowerdev = __dev_get_by_index(vxlan->net,
							 dst->remote_ifindex);
3162
	bool use_ipv6 = !!(vxlan->cfg.flags & VXLAN_F_IPV6);
3163

3164 3165 3166 3167 3168 3169 3170
	/* This check is different than dev->max_mtu, because it looks at
	 * the lowerdev->mtu, rather than the static dev->max_mtu
	 */
	if (lowerdev) {
		int max_mtu = lowerdev->mtu -
			      (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
		if (new_mtu > max_mtu)
D
David Wragg 已提交
3171 3172 3173
			return -EINVAL;
	}

3174 3175 3176 3177
	dev->mtu = new_mtu;
	return 0;
}

3178 3179 3180 3181 3182 3183 3184 3185 3186 3187
static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct ip_tunnel_info *info = skb_tunnel_info(skb);
	__be16 sport, dport;

	sport = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
				  vxlan->cfg.port_max, true);
	dport = info->key.tp_dst ? : vxlan->cfg.dst_port;

3188
	if (ip_tunnel_info_af(info) == AF_INET) {
3189
		struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
3190 3191
		struct rtable *rt;

3192
		rt = vxlan_get_route(vxlan, dev, sock4, skb, 0, info->key.tos,
3193
				     info->key.u.ipv4.dst,
3194 3195
				     &info->key.u.ipv4.src, dport, sport,
				     &info->dst_cache, info);
3196 3197 3198
		if (IS_ERR(rt))
			return PTR_ERR(rt);
		ip_rt_put(rt);
3199 3200
	} else {
#if IS_ENABLED(CONFIG_IPV6)
3201
		struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
3202 3203
		struct dst_entry *ndst;

3204
		ndst = vxlan6_get_route(vxlan, dev, sock6, skb, 0, info->key.tos,
3205
					info->key.label, &info->key.u.ipv6.dst,
3206 3207
					&info->key.u.ipv6.src, dport, sport,
					&info->dst_cache, info);
3208 3209 3210 3211 3212 3213 3214
		if (IS_ERR(ndst))
			return PTR_ERR(ndst);
		dst_release(ndst);
#else /* !CONFIG_IPV6 */
		return -EPFNOSUPPORT;
#endif
	}
3215 3216
	info->key.tp_src = sport;
	info->key.tp_dst = dport;
3217
	return 0;
3218 3219
}

3220
static const struct net_device_ops vxlan_netdev_ether_ops = {
S
stephen hemminger 已提交
3221
	.ndo_init		= vxlan_init,
3222
	.ndo_uninit		= vxlan_uninit,
S
stephen hemminger 已提交
3223 3224 3225
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,
	.ndo_start_xmit		= vxlan_xmit,
H
Heiner Kallweit 已提交
3226
	.ndo_get_stats64	= dev_get_tstats64,
S
stephen hemminger 已提交
3227
	.ndo_set_rx_mode	= vxlan_set_multicast_list,
3228
	.ndo_change_mtu		= vxlan_change_mtu,
S
stephen hemminger 已提交
3229 3230 3231 3232 3233
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_fdb_add		= vxlan_fdb_add,
	.ndo_fdb_del		= vxlan_fdb_delete,
	.ndo_fdb_dump		= vxlan_fdb_dump,
R
Roopa Prabhu 已提交
3234
	.ndo_fdb_get		= vxlan_fdb_get,
3235
	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,
3236
	.ndo_change_proto_down  = dev_change_proto_down_generic,
S
stephen hemminger 已提交
3237 3238
};

J
Jiri Benc 已提交
3239 3240 3241 3242 3243 3244
static const struct net_device_ops vxlan_netdev_raw_ops = {
	.ndo_init		= vxlan_init,
	.ndo_uninit		= vxlan_uninit,
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,
	.ndo_start_xmit		= vxlan_xmit,
H
Heiner Kallweit 已提交
3245
	.ndo_get_stats64	= dev_get_tstats64,
J
Jiri Benc 已提交
3246 3247 3248 3249
	.ndo_change_mtu		= vxlan_change_mtu,
	.ndo_fill_metadata_dst	= vxlan_fill_metadata_dst,
};

S
stephen hemminger 已提交
3250 3251 3252 3253 3254
/* Info for udev, that this is a virtual tunnel endpoint.
 * Attached to the net_device with SET_NETDEV_DEVTYPE() in vxlan_setup().
 */
static struct device_type vxlan_type = {
	.name = "vxlan",
};

3255
/* Calls the ndo_udp_tunnel_add of the caller in order to
J
Joseph Gasparakis 已提交
3256
 * supply the listening VXLAN udp ports. Callers are expected
3257
 * to implement the ndo_udp_tunnel_add.
3258
 */
3259
static void vxlan_offload_rx_ports(struct net_device *dev, bool push)
3260 3261 3262 3263
{
	struct vxlan_sock *vs;
	struct net *net = dev_net(dev);
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
J
Joseph Gasparakis 已提交
3264
	unsigned int i;
3265 3266 3267

	spin_lock(&vn->sock_lock);
	for (i = 0; i < PORT_HASH_SIZE; ++i) {
3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280
		hlist_for_each_entry_rcu(vs, &vn->sock_list[i], hlist) {
			unsigned short type;

			if (vs->flags & VXLAN_F_GPE)
				type = UDP_TUNNEL_TYPE_VXLAN_GPE;
			else
				type = UDP_TUNNEL_TYPE_VXLAN;

			if (push)
				udp_tunnel_push_rx_port(dev, vs->sock, type);
			else
				udp_tunnel_drop_rx_port(dev, vs->sock, type);
		}
3281 3282 3283 3284
	}
	spin_unlock(&vn->sock_lock);
}

S
stephen hemminger 已提交
3285 3286 3287 3288
/* Initialize the device structure. */
static void vxlan_setup(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
3289
	unsigned int h;
S
stephen hemminger 已提交
3290

3291 3292 3293
	eth_hw_addr_random(dev);
	ether_setup(dev);

3294
	dev->needs_free_netdev = true;
S
stephen hemminger 已提交
3295 3296 3297
	SET_NETDEV_DEVTYPE(dev, &vxlan_type);

	dev->features	|= NETIF_F_LLTX;
3298
	dev->features	|= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
3299
	dev->features   |= NETIF_F_RXCSUM;
3300
	dev->features   |= NETIF_F_GSO_SOFTWARE;
3301

3302
	dev->vlan_features = dev->features;
3303 3304
	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
	dev->hw_features |= NETIF_F_RXCSUM;
3305
	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
3306
	netif_keep_dst(dev);
3307
	dev->priv_flags |= IFF_NO_QUEUE;
S
stephen hemminger 已提交
3308

3309 3310 3311 3312
	/* MTU range: 68 - 65535 */
	dev->min_mtu = ETH_MIN_MTU;
	dev->max_mtu = ETH_MAX_MTU;

3313
	INIT_LIST_HEAD(&vxlan->next);
S
stephen hemminger 已提交
3314

3315
	timer_setup(&vxlan->age_timer, vxlan_cleanup, TIMER_DEFERRABLE);
S
stephen hemminger 已提交
3316 3317 3318

	vxlan->dev = dev;

3319 3320
	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		spin_lock_init(&vxlan->hash_lock[h]);
S
stephen hemminger 已提交
3321
		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
3322
	}
S
stephen hemminger 已提交
3323 3324
}

3325 3326 3327 3328 3329 3330 3331
/* Finish setup of an Ethernet-type vxlan device: install the Ethernet
 * ops table, clear IFF_TX_SKB_SHARING and set IFF_LIVE_ADDR_CHANGE.
 */
static void vxlan_ether_setup(struct net_device *dev)
{
	dev->netdev_ops = &vxlan_netdev_ether_ops;
	dev->priv_flags = (dev->priv_flags & ~IFF_TX_SKB_SHARING) |
			  IFF_LIVE_ADDR_CHANGE;
}

J
Jiri Benc 已提交
3332 3333
static void vxlan_raw_setup(struct net_device *dev)
{
3334
	dev->header_ops = NULL;
J
Jiri Benc 已提交
3335 3336 3337 3338 3339 3340 3341
	dev->type = ARPHRD_NONE;
	dev->hard_header_len = 0;
	dev->addr_len = 0;
	dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
	dev->netdev_ops = &vxlan_netdev_raw_ops;
}

S
stephen hemminger 已提交
3342 3343
static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
3344
	[IFLA_VXLAN_GROUP]	= { .len = sizeof_field(struct iphdr, daddr) },
C
Cong Wang 已提交
3345
	[IFLA_VXLAN_GROUP6]	= { .len = sizeof(struct in6_addr) },
S
stephen hemminger 已提交
3346
	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
3347
	[IFLA_VXLAN_LOCAL]	= { .len = sizeof_field(struct iphdr, saddr) },
C
Cong Wang 已提交
3348
	[IFLA_VXLAN_LOCAL6]	= { .len = sizeof(struct in6_addr) },
S
stephen hemminger 已提交
3349 3350
	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
3351
	[IFLA_VXLAN_LABEL]	= { .type = NLA_U32 },
S
stephen hemminger 已提交
3352 3353 3354
	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
	[IFLA_VXLAN_AGEING]	= { .type = NLA_U32 },
	[IFLA_VXLAN_LIMIT]	= { .type = NLA_U32 },
3355
	[IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
D
David Stevens 已提交
3356 3357 3358 3359
	[IFLA_VXLAN_PROXY]	= { .type = NLA_U8 },
	[IFLA_VXLAN_RSC]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L2MISS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L3MISS]	= { .type = NLA_U8 },
3360
	[IFLA_VXLAN_COLLECT_METADATA]	= { .type = NLA_U8 },
3361
	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
3362 3363 3364
	[IFLA_VXLAN_UDP_CSUM]	= { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]	= { .type = NLA_U8 },
T
Tom Herbert 已提交
3365 3366
	[IFLA_VXLAN_REMCSUM_TX]	= { .type = NLA_U8 },
	[IFLA_VXLAN_REMCSUM_RX]	= { .type = NLA_U8 },
T
Thomas Graf 已提交
3367
	[IFLA_VXLAN_GBP]	= { .type = NLA_FLAG, },
J
Jiri Benc 已提交
3368
	[IFLA_VXLAN_GPE]	= { .type = NLA_FLAG, },
3369
	[IFLA_VXLAN_REMCSUM_NOPARTIAL]	= { .type = NLA_FLAG },
H
Hangbin Liu 已提交
3370
	[IFLA_VXLAN_TTL_INHERIT]	= { .type = NLA_FLAG },
3371
	[IFLA_VXLAN_DF]		= { .type = NLA_U8 },
S
stephen hemminger 已提交
3372 3373
};

3374 3375
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
			  struct netlink_ext_ack *extack)
S
stephen hemminger 已提交
3376 3377 3378
{
	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
3379 3380
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
					    "Provided link layer address is not Ethernet");
S
stephen hemminger 已提交
3381 3382 3383 3384
			return -EINVAL;
		}

		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
3385 3386
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_ADDRESS],
					    "Provided Ethernet address is not unicast");
S
stephen hemminger 已提交
3387 3388 3389 3390
			return -EADDRNOTAVAIL;
		}
	}

3391
	if (tb[IFLA_MTU]) {
3392
		u32 mtu = nla_get_u32(tb[IFLA_MTU]);
3393

3394 3395 3396
		if (mtu < ETH_MIN_MTU || mtu > ETH_MAX_MTU) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
					    "MTU must be between 68 and 65535");
3397
			return -EINVAL;
3398
		}
3399 3400
	}

3401 3402 3403
	if (!data) {
		NL_SET_ERR_MSG(extack,
			       "Required attributes not provided to perform the operation");
S
stephen hemminger 已提交
3404
		return -EINVAL;
3405
	}
S
stephen hemminger 已提交
3406 3407

	if (data[IFLA_VXLAN_ID]) {
3408 3409
		u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);

3410
		if (id >= VXLAN_N_VID) {
3411
			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_ID],
3412
					    "VXLAN ID must be lower than 16777216");
S
stephen hemminger 已提交
3413
			return -ERANGE;
3414
		}
S
stephen hemminger 已提交
3415 3416
	}

3417 3418 3419 3420 3421
	if (data[IFLA_VXLAN_PORT_RANGE]) {
		const struct ifla_vxlan_port_range *p
			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);

		if (ntohs(p->high) < ntohs(p->low)) {
3422
			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_PORT_RANGE],
3423
					    "Invalid source port range");
3424 3425 3426 3427
			return -EINVAL;
		}
	}

3428 3429 3430 3431
	if (data[IFLA_VXLAN_DF]) {
		enum ifla_vxlan_df df = nla_get_u8(data[IFLA_VXLAN_DF]);

		if (df < 0 || df > VXLAN_DF_MAX) {
3432
			NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_DF],
3433 3434 3435 3436 3437
					    "Invalid DF attribute");
			return -EINVAL;
		}
	}

S
stephen hemminger 已提交
3438 3439 3440
	return 0;
}

Y
Yan Burman 已提交
3441 3442 3443 3444 3445 3446 3447
/* ethtool .get_drvinfo: report the driver name and version strings.
 * strscpy() replaces the deprecated strlcpy(); the copies stay bounded
 * by the destination size and NUL-terminated.
 */
static void vxlan_get_drvinfo(struct net_device *netdev,
			      struct ethtool_drvinfo *drvinfo)
{
	strscpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
	strscpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
}

3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466
/* ethtool .get_link_ksettings: forward the query to the device found
 * via the default destination's remote_ifindex. When no such device
 * exists, report unknown duplex/speed and PORT_OTHER and return 0.
 */
static int vxlan_get_link_ksettings(struct net_device *dev,
				    struct ethtool_link_ksettings *cmd)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct net_device *lower;

	lower = __dev_get_by_index(vxlan->net,
				   vxlan->default_dst.remote_ifindex);
	if (lower)
		return __ethtool_get_link_ksettings(lower, cmd);

	/* no lower device to ask */
	cmd->base.duplex = DUPLEX_UNKNOWN;
	cmd->base.port = PORT_OTHER;
	cmd->base.speed = SPEED_UNKNOWN;

	return 0;
}

Y
Yan Burman 已提交
3467
static const struct ethtool_ops vxlan_ethtool_ops = {
3468 3469 3470
	.get_drvinfo		= vxlan_get_drvinfo,
	.get_link		= ethtool_op_get_link,
	.get_link_ksettings	= vxlan_get_link_ksettings,
Y
Yan Burman 已提交
3471 3472
};

T
Tom Herbert 已提交
3473
static struct socket *vxlan_create_sock(struct net *net, bool ipv6,
3474
					__be16 port, u32 flags, int ifindex)
3475
{
C
Cong Wang 已提交
3476
	struct socket *sock;
T
Tom Herbert 已提交
3477 3478
	struct udp_port_cfg udp_conf;
	int err;
C
Cong Wang 已提交
3479

T
Tom Herbert 已提交
3480
	memset(&udp_conf, 0, sizeof(udp_conf));
C
Cong Wang 已提交
3481

T
Tom Herbert 已提交
3482 3483 3484
	if (ipv6) {
		udp_conf.family = AF_INET6;
		udp_conf.use_udp6_rx_checksums =
3485
		    !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
3486
		udp_conf.ipv6_v6only = 1;
T
Tom Herbert 已提交
3487 3488
	} else {
		udp_conf.family = AF_INET;
C
Cong Wang 已提交
3489 3490
	}

T
Tom Herbert 已提交
3491
	udp_conf.local_udp_port = port;
3492
	udp_conf.bind_ifindex = ifindex;
3493

T
Tom Herbert 已提交
3494 3495 3496 3497
	/* Open UDP socket */
	err = udp_sock_create(net, &udp_conf, &sock);
	if (err < 0)
		return ERR_PTR(err);
C
Cong Wang 已提交
3498

P
Paolo Abeni 已提交
3499
	udp_allow_gso(sock->sk);
Z
Zhi Yong Wu 已提交
3500
	return sock;
C
Cong Wang 已提交
3501 3502 3503
}

/* Create new listen socket if needed */
3504
static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
3505 3506
					      __be16 port, u32 flags,
					      int ifindex)
C
Cong Wang 已提交
3507 3508 3509 3510 3511
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_sock *vs;
	struct socket *sock;
	unsigned int h;
3512
	struct udp_tunnel_sock_cfg tunnel_cfg;
C
Cong Wang 已提交
3513

3514
	vs = kzalloc(sizeof(*vs), GFP_KERNEL);
C
Cong Wang 已提交
3515 3516 3517 3518 3519 3520
	if (!vs)
		return ERR_PTR(-ENOMEM);

	for (h = 0; h < VNI_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vs->vni_list[h]);

3521
	sock = vxlan_create_sock(net, ipv6, port, flags, ifindex);
Z
Zhi Yong Wu 已提交
3522
	if (IS_ERR(sock)) {
3523
		kfree(vs);
3524
		return ERR_CAST(sock);
3525
	}
C
Cong Wang 已提交
3526 3527

	vs->sock = sock;
3528
	refcount_set(&vs->refcnt, 1);
3529
	vs->flags = (flags & VXLAN_F_RCV_FLAGS);
3530

3531 3532
	spin_lock(&vn->sock_lock);
	hlist_add_head_rcu(&vs->hlist, vs_head(net, port));
3533
	udp_tunnel_notify_add_rx_port(sock,
3534 3535
				      (vs->flags & VXLAN_F_GPE) ?
				      UDP_TUNNEL_TYPE_VXLAN_GPE :
3536
				      UDP_TUNNEL_TYPE_VXLAN);
3537
	spin_unlock(&vn->sock_lock);
3538 3539

	/* Mark socket as an encapsulation socket. */
3540
	memset(&tunnel_cfg, 0, sizeof(tunnel_cfg));
3541 3542
	tunnel_cfg.sk_user_data = vs;
	tunnel_cfg.encap_type = 1;
3543
	tunnel_cfg.encap_rcv = vxlan_rcv;
S
Stefano Brivio 已提交
3544
	tunnel_cfg.encap_err_lookup = vxlan_err_lookup;
3545
	tunnel_cfg.encap_destroy = NULL;
3546 3547
	tunnel_cfg.gro_receive = vxlan_gro_receive;
	tunnel_cfg.gro_complete = vxlan_gro_complete;
3548 3549

	setup_udp_tunnel_sock(net, sock, &tunnel_cfg);
C
Cong Wang 已提交
3550

3551 3552 3553
	return vs;
}

3554
/* Attach @vxlan to a vxlan_sock for one address family.
 *
 * Unless cfg.no_share is set, an existing matching socket is looked up
 * under sock_lock and a reference is taken on it; if that reference
 * cannot be taken, -EBUSY is returned. When no socket was found (or
 * sharing is disabled) a new one is created. The socket is published in
 * vn6_sock or vn4_sock and the device is hashed into it.
 *
 * Returns 0 on success or a negative errno.
 */
static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
{
	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
	struct vxlan_sock *vs = NULL;
	struct vxlan_dev_node *node;
	int l3mdev_index = 0;

	if (vxlan->cfg.remote_ifindex)
		l3mdev_index = l3mdev_master_upper_ifindex_by_index(
			vxlan->net, vxlan->cfg.remote_ifindex);

	if (!vxlan->cfg.no_share) {
		spin_lock(&vn->sock_lock);
		vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
				     vxlan->cfg.dst_port, vxlan->cfg.flags,
				     l3mdev_index);
		/* found, but its refcount already dropped to zero */
		if (vs && !refcount_inc_not_zero(&vs->refcnt)) {
			spin_unlock(&vn->sock_lock);
			return -EBUSY;
		}
		spin_unlock(&vn->sock_lock);
	}
	if (!vs)
		vs = vxlan_socket_create(vxlan->net, ipv6,
					 vxlan->cfg.dst_port, vxlan->cfg.flags,
					 l3mdev_index);
	if (IS_ERR(vs))
		return PTR_ERR(vs);
#if IS_ENABLED(CONFIG_IPV6)
	if (ipv6) {
		rcu_assign_pointer(vxlan->vn6_sock, vs);
		node = &vxlan->hlist6;
	} else
#endif
	{
		rcu_assign_pointer(vxlan->vn4_sock, vs);
		node = &vxlan->hlist4;
	}
	vxlan_vs_add_dev(vs, vxlan, node);
	return 0;
}

3596 3597
static int vxlan_sock_add(struct vxlan_dev *vxlan)
{
3598 3599
	bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
	bool ipv6 = vxlan->cfg.flags & VXLAN_F_IPV6 || metadata;
3600
	bool ipv4 = !ipv6 || metadata;
3601 3602
	int ret = 0;

3603
	RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
3604
#if IS_ENABLED(CONFIG_IPV6)
3605
	RCU_INIT_POINTER(vxlan->vn6_sock, NULL);
3606
	if (ipv6) {
3607
		ret = __vxlan_sock_add(vxlan, true);
3608 3609 3610
		if (ret < 0 && ret != -EAFNOSUPPORT)
			ipv4 = false;
	}
3611
#endif
3612
	if (ipv4)
3613 3614 3615 3616 3617 3618
		ret = __vxlan_sock_add(vxlan, false);
	if (ret < 0)
		vxlan_sock_release(vxlan);
	return ret;
}

3619 3620
static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf,
				 struct net_device **lower,
3621 3622
				 struct vxlan_dev *old,
				 struct netlink_ext_ack *extack)
S
stephen hemminger 已提交
3623
{
3624
	struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
3625
	struct vxlan_dev *tmp;
C
Cong Wang 已提交
3626
	bool use_ipv6 = false;
S
stephen hemminger 已提交
3627

3628 3629 3630 3631 3632 3633 3634 3635
	if (conf->flags & VXLAN_F_GPE) {
		/* For now, allow GPE only together with
		 * COLLECT_METADATA. This can be relaxed later; in such
		 * case, the other side of the PtP link will have to be
		 * provided.
		 */
		if ((conf->flags & ~VXLAN_F_ALLOWED_GPE) ||
		    !(conf->flags & VXLAN_F_COLLECT_METADATA)) {
3636 3637
			NL_SET_ERR_MSG(extack,
				       "VXLAN GPE does not support this combination of attributes");
3638
			return -EINVAL;
3639
		}
J
Jiri Benc 已提交
3640
	}
3641

3642 3643
	if (!conf->remote_ip.sa.sa_family && !conf->saddr.sa.sa_family) {
		/* Unless IPv6 is explicitly requested, assume IPv4 */
3644
		conf->remote_ip.sa.sa_family = AF_INET;
3645 3646 3647 3648 3649 3650 3651
		conf->saddr.sa.sa_family = AF_INET;
	} else if (!conf->remote_ip.sa.sa_family) {
		conf->remote_ip.sa.sa_family = conf->saddr.sa.sa_family;
	} else if (!conf->saddr.sa.sa_family) {
		conf->saddr.sa.sa_family = conf->remote_ip.sa.sa_family;
	}

3652 3653 3654
	if (conf->saddr.sa.sa_family != conf->remote_ip.sa.sa_family) {
		NL_SET_ERR_MSG(extack,
			       "Local and remote address must be from the same family");
3655
		return -EINVAL;
3656
	}
C
Cong Wang 已提交
3657

3658 3659
	if (vxlan_addr_multicast(&conf->saddr)) {
		NL_SET_ERR_MSG(extack, "Local address cannot be multicast");
3660
		return -EINVAL;
3661
	}
3662

3663
	if (conf->saddr.sa.sa_family == AF_INET6) {
3664 3665 3666
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG(extack,
				       "IPv6 support not enabled in the kernel");
3667
			return -EPFNOSUPPORT;
3668
		}
C
Cong Wang 已提交
3669
		use_ipv6 = true;
3670
		conf->flags |= VXLAN_F_IPV6;
3671 3672 3673 3674 3675 3676 3677 3678 3679

		if (!(conf->flags & VXLAN_F_COLLECT_METADATA)) {
			int local_type =
				ipv6_addr_type(&conf->saddr.sin6.sin6_addr);
			int remote_type =
				ipv6_addr_type(&conf->remote_ip.sin6.sin6_addr);

			if (local_type & IPV6_ADDR_LINKLOCAL) {
				if (!(remote_type & IPV6_ADDR_LINKLOCAL) &&
3680 3681 3682
				    (remote_type != IPV6_ADDR_ANY)) {
					NL_SET_ERR_MSG(extack,
						       "Invalid combination of local and remote address scopes");
3683
					return -EINVAL;
3684
				}
3685 3686 3687 3688

				conf->flags |= VXLAN_F_IPV6_LINKLOCAL;
			} else {
				if (remote_type ==
3689 3690 3691
				    (IPV6_ADDR_UNICAST | IPV6_ADDR_LINKLOCAL)) {
					NL_SET_ERR_MSG(extack,
						       "Invalid combination of local and remote address scopes");
3692
					return -EINVAL;
3693
				}
3694 3695 3696 3697

				conf->flags &= ~VXLAN_F_IPV6_LINKLOCAL;
			}
		}
3698
	}
S
stephen hemminger 已提交
3699

3700 3701 3702
	if (conf->label && !use_ipv6) {
		NL_SET_ERR_MSG(extack,
			       "Label attribute only applies to IPv6 VXLAN devices");
3703
		return -EINVAL;
3704
	}
3705

3706 3707
	if (conf->remote_ifindex) {
		struct net_device *lowerdev;
3708

3709
		lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
3710 3711 3712
		if (!lowerdev) {
			NL_SET_ERR_MSG(extack,
				       "Invalid local interface, device not found");
3713
			return -ENODEV;
3714
		}
S
stephen hemminger 已提交
3715

C
Cong Wang 已提交
3716 3717 3718
#if IS_ENABLED(CONFIG_IPV6)
		if (use_ipv6) {
			struct inet6_dev *idev = __in6_dev_get(lowerdev);
3719

3720 3721 3722
			if (idev && idev->cnf.disable_ipv6) {
				NL_SET_ERR_MSG(extack,
					       "IPv6 support disabled by administrator");
C
Cong Wang 已提交
3723
				return -EPERM;
3724
			}
C
Cong Wang 已提交
3725 3726 3727
		}
#endif

3728 3729
		*lower = lowerdev;
	} else {
3730 3731 3732 3733
		if (vxlan_addr_multicast(&conf->remote_ip)) {
			NL_SET_ERR_MSG(extack,
				       "Local interface required for multicast remote destination");

3734
			return -EINVAL;
3735
		}
3736

3737
#if IS_ENABLED(CONFIG_IPV6)
3738 3739 3740
		if (conf->flags & VXLAN_F_IPV6_LINKLOCAL) {
			NL_SET_ERR_MSG(extack,
				       "Local interface required for link-local local/remote addresses");
3741
			return -EINVAL;
3742
		}
3743 3744
#endif

3745
		*lower = NULL;
J
Jiri Benc 已提交
3746
	}
S
stephen hemminger 已提交
3747

3748 3749 3750 3751 3752
	if (!conf->dst_port) {
		if (conf->flags & VXLAN_F_GPE)
			conf->dst_port = htons(4790); /* IANA VXLAN-GPE port */
		else
			conf->dst_port = htons(vxlan_port);
3753 3754
	}

3755 3756
	if (!conf->age_interval)
		conf->age_interval = FDB_AGE_DEFAULT;
3757

3758 3759 3760
	list_for_each_entry(tmp, &vn->vxlan_list, next) {
		if (tmp == old)
			continue;
3761

3762 3763 3764 3765 3766
		if (tmp->cfg.vni != conf->vni)
			continue;
		if (tmp->cfg.dst_port != conf->dst_port)
			continue;
		if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) !=
3767
		    (conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)))
3768 3769 3770 3771 3772 3773
			continue;

		if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) &&
		    tmp->cfg.remote_ifindex != conf->remote_ifindex)
			continue;

3774 3775
		NL_SET_ERR_MSG(extack,
			       "A VXLAN device with the specified VNI already exists");
3776
		return -EEXIST;
3777
	}
3778

3779 3780 3781 3782 3783
	return 0;
}

/* Apply an already-validated @conf to @dev.
 *
 * On creation (!@changelink) the raw or Ethernet setup is chosen by
 * VXLAN_F_GPE, a configured MTU is applied and the source netns is
 * recorded. In every case the default destination's VNI and address are
 * updated. With a @lowerdev, the GSO limits, tailroom, headroom and
 * maximum MTU are derived from it. dev->mtu is clamped to the resulting
 * maximum, and @conf is copied into vxlan->cfg.
 */
static void vxlan_config_apply(struct net_device *dev,
			       struct vxlan_config *conf,
			       struct net_device *lowerdev,
			       struct net *src_net,
			       bool changelink)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	unsigned short needed_headroom = ETH_HLEN;
	bool use_ipv6 = !!(conf->flags & VXLAN_F_IPV6);
	int max_mtu = ETH_MAX_MTU;

	if (!changelink) {
		if (conf->flags & VXLAN_F_GPE)
			vxlan_raw_setup(dev);
		else
			vxlan_ether_setup(dev);

		if (conf->mtu)
			dev->mtu = conf->mtu;

		vxlan->net = src_net;
	}

	dst->remote_vni = conf->vni;

	memcpy(&dst->remote_ip, &conf->remote_ip, sizeof(conf->remote_ip));

	if (lowerdev) {
		dst->remote_ifindex = conf->remote_ifindex;

		dev->gso_max_size = lowerdev->gso_max_size;
		dev->gso_max_segs = lowerdev->gso_max_segs;

		needed_headroom = lowerdev->hard_header_len;
		needed_headroom += lowerdev->needed_headroom;

		dev->needed_tailroom = lowerdev->needed_tailroom;

		max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM :
					   VXLAN_HEADROOM);
		if (max_mtu < ETH_MIN_MTU)
			max_mtu = ETH_MIN_MTU;

		if (!changelink && !conf->mtu)
			dev->mtu = max_mtu;
	}

	if (dev->mtu > max_mtu)
		dev->mtu = max_mtu;

	/* metadata mode reserves the larger IPv6 headroom as well */
	if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA)
		needed_headroom += VXLAN6_HEADROOM;
	else
		needed_headroom += VXLAN_HEADROOM;
	dev->needed_headroom = needed_headroom;

	memcpy(&vxlan->cfg, conf, sizeof(*conf));
}
3841

3842
/* Validate @conf and, if it is acceptable, apply it to @dev.
 * Returns 0 on success or the error from vxlan_config_validate(); on
 * error nothing is applied.
 */
static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
			       struct vxlan_config *conf, bool changelink,
			       struct netlink_ext_ack *extack)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct net_device *lowerdev;
	int ret;

	ret = vxlan_config_validate(src_net, conf, &lowerdev, vxlan, extack);
	if (ret)
		return ret;

	vxlan_config_apply(dev, conf, lowerdev, src_net, changelink);

	return 0;
}

N
Nicolas Dichtel 已提交
3859
static int __vxlan_dev_create(struct net *net, struct net_device *dev,
3860 3861
			      struct vxlan_config *conf,
			      struct netlink_ext_ack *extack)
N
Nicolas Dichtel 已提交
3862 3863 3864
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan = netdev_priv(dev);
3865
	struct net_device *remote_dev = NULL;
3866
	struct vxlan_fdb *f = NULL;
3867
	bool unregister = false;
3868
	struct vxlan_rdst *dst;
N
Nicolas Dichtel 已提交
3869 3870
	int err;

3871
	dst = &vxlan->default_dst;
3872
	err = vxlan_dev_configure(net, dev, conf, false, extack);
N
Nicolas Dichtel 已提交
3873 3874 3875 3876 3877 3878
	if (err)
		return err;

	dev->ethtool_ops = &vxlan_ethtool_ops;

	/* create an fdb entry for a valid default destination */
3879
	if (!vxlan_addr_any(&dst->remote_ip)) {
3880
		err = vxlan_fdb_create(vxlan, all_zeros_mac,
3881
				       &dst->remote_ip,
N
Nicolas Dichtel 已提交
3882 3883
				       NUD_REACHABLE | NUD_PERMANENT,
				       vxlan->cfg.dst_port,
3884 3885 3886
				       dst->remote_vni,
				       dst->remote_vni,
				       dst->remote_ifindex,
3887
				       NTF_SELF, 0, &f, extack);
N
Nicolas Dichtel 已提交
3888 3889 3890 3891 3892
		if (err)
			return err;
	}

	err = register_netdevice(dev);
3893 3894
	if (err)
		goto errout;
3895
	unregister = true;
3896

3897 3898
	if (dst->remote_ifindex) {
		remote_dev = __dev_get_by_index(net, dst->remote_ifindex);
3899 3900
		if (!remote_dev) {
			err = -ENODEV;
3901
			goto errout;
3902
		}
3903 3904 3905 3906 3907 3908

		err = netdev_upper_dev_link(remote_dev, dev, extack);
		if (err)
			goto errout;
	}

3909
	err = rtnl_configure_link(dev, NULL);
3910
	if (err < 0)
3911
		goto unlink;
N
Nicolas Dichtel 已提交
3912

3913
	if (f) {
3914
		vxlan_fdb_insert(vxlan, all_zeros_mac, dst->remote_vni, f);
3915 3916

		/* notify default fdb entry */
3917
		err = vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f),
3918
				       RTM_NEWNEIGH, true, extack);
3919 3920
		if (err) {
			vxlan_fdb_destroy(vxlan, f, false, false);
3921 3922
			if (remote_dev)
				netdev_upper_dev_unlink(remote_dev, dev);
3923 3924
			goto unregister;
		}
3925
	}
3926

N
Nicolas Dichtel 已提交
3927
	list_add(&vxlan->next, &vn->vxlan_list);
3928 3929
	if (remote_dev)
		dst->remote_dev = remote_dev;
N
Nicolas Dichtel 已提交
3930
	return 0;
3931 3932 3933
unlink:
	if (remote_dev)
		netdev_upper_dev_unlink(remote_dev, dev);
3934
errout:
3935 3936 3937 3938
	/* unregister_netdevice() destroys the default FDB entry with deletion
	 * notification. But the addition notification was not sent yet, so
	 * destroy the entry by hand here.
	 */
3939
	if (f)
3940 3941
		__vxlan_fdb_free(f);
unregister:
3942 3943
	if (unregister)
		unregister_netdevice(dev);
3944
	return err;
N
Nicolas Dichtel 已提交
3945 3946
}

3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974
/* Set/clear flags based on attribute.
 *
 * Does nothing when tb[@attrtype] is absent. On a changelink request for
 * an attribute that does not support it, an error is reported through
 * vxlan_flag_attr_error() and -EOPNOTSUPP is returned. Otherwise @mask
 * is set in conf->flags when the attribute is an NLA_FLAG or carries a
 * non-zero u8, and cleared when it carries a zero u8.
 */
static int vxlan_nl2flag(struct vxlan_config *conf, struct nlattr *tb[],
			  int attrtype, unsigned long mask, bool changelink,
			  bool changelink_supported,
			  struct netlink_ext_ack *extack)
{
	bool enable;

	if (!tb[attrtype])
		return 0;

	if (changelink && !changelink_supported) {
		vxlan_flag_attr_error(attrtype, extack);
		return -EOPNOTSUPP;
	}

	/* an NLA_FLAG has no payload: its presence alone means "on" */
	enable = vxlan_policy[attrtype].type == NLA_FLAG ||
		 nla_get_u8(tb[attrtype]);

	if (enable)
		conf->flags |= mask;
	else
		conf->flags &= ~mask;

	return 0;
}

R
Roopa Prabhu 已提交
3975 3976
static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
			 struct net_device *dev, struct vxlan_config *conf,
3977
			 bool changelink, struct netlink_ext_ack *extack)
3978
{
R
Roopa Prabhu 已提交
3979
	struct vxlan_dev *vxlan = netdev_priv(dev);
3980
	int err = 0;
3981

R
Roopa Prabhu 已提交
3982
	memset(conf, 0, sizeof(*conf));
3983

R
Roopa Prabhu 已提交
3984 3985 3986 3987 3988 3989 3990
	/* if changelink operation, start with old existing cfg */
	if (changelink)
		memcpy(conf, &vxlan->cfg, sizeof(*conf));

	if (data[IFLA_VXLAN_ID]) {
		__be32 vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));

3991 3992
		if (changelink && (vni != conf->vni)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_ID], "Cannot change VNI");
R
Roopa Prabhu 已提交
3993
			return -EOPNOTSUPP;
3994
		}
R
Roopa Prabhu 已提交
3995 3996
		conf->vni = cpu_to_be32(nla_get_u32(data[IFLA_VXLAN_ID]));
	}
3997 3998

	if (data[IFLA_VXLAN_GROUP]) {
3999 4000
		if (changelink && (conf->remote_ip.sa.sa_family != AF_INET)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP], "New group address family does not match old group");
4001
			return -EOPNOTSUPP;
4002
		}
4003

R
Roopa Prabhu 已提交
4004
		conf->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
4005
		conf->remote_ip.sa.sa_family = AF_INET;
4006
	} else if (data[IFLA_VXLAN_GROUP6]) {
4007 4008
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "IPv6 support not enabled in the kernel");
4009
			return -EPFNOSUPPORT;
4010
		}
4011

4012 4013
		if (changelink && (conf->remote_ip.sa.sa_family != AF_INET6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_GROUP6], "New group address family does not match old group");
4014
			return -EOPNOTSUPP;
4015
		}
4016

R
Roopa Prabhu 已提交
4017 4018
		conf->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
		conf->remote_ip.sa.sa_family = AF_INET6;
4019 4020 4021
	}

	if (data[IFLA_VXLAN_LOCAL]) {
4022 4023
		if (changelink && (conf->saddr.sa.sa_family != AF_INET)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL], "New local address family does not match old");
4024
			return -EOPNOTSUPP;
4025
		}
4026

R
Roopa Prabhu 已提交
4027 4028
		conf->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
		conf->saddr.sa.sa_family = AF_INET;
4029
	} else if (data[IFLA_VXLAN_LOCAL6]) {
4030 4031
		if (!IS_ENABLED(CONFIG_IPV6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "IPv6 support not enabled in the kernel");
4032
			return -EPFNOSUPPORT;
4033
		}
4034

4035 4036
		if (changelink && (conf->saddr.sa.sa_family != AF_INET6)) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LOCAL6], "New local address family does not match old");
4037
			return -EOPNOTSUPP;
4038
		}
4039

4040
		/* TODO: respect scope id */
R
Roopa Prabhu 已提交
4041 4042
		conf->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
		conf->saddr.sa.sa_family = AF_INET6;
4043 4044 4045
	}

	if (data[IFLA_VXLAN_LINK])
R
Roopa Prabhu 已提交
4046
		conf->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]);
4047

S
stephen hemminger 已提交
4048
	if (data[IFLA_VXLAN_TOS])
R
Roopa Prabhu 已提交
4049
		conf->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);
S
stephen hemminger 已提交
4050

4051
	if (data[IFLA_VXLAN_TTL])
R
Roopa Prabhu 已提交
4052
		conf->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);
4053

H
Hangbin Liu 已提交
4054
	if (data[IFLA_VXLAN_TTL_INHERIT]) {
4055 4056 4057 4058 4059 4060
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_TTL_INHERIT,
				    VXLAN_F_TTL_INHERIT, changelink, false,
				    extack);
		if (err)
			return err;

H
Hangbin Liu 已提交
4061 4062
	}

4063
	if (data[IFLA_VXLAN_LABEL])
R
Roopa Prabhu 已提交
4064
		conf->label = nla_get_be32(data[IFLA_VXLAN_LABEL]) &
4065 4066
			     IPV6_FLOWLABEL_MASK;

R
Roopa Prabhu 已提交
4067
	if (data[IFLA_VXLAN_LEARNING]) {
4068 4069 4070 4071 4072
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_LEARNING,
				    VXLAN_F_LEARN, changelink, true,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4073 4074 4075 4076
	} else if (!changelink) {
		/* default to learn on a new device */
		conf->flags |= VXLAN_F_LEARN;
	}
S
stephen hemminger 已提交
4077

I
Ido Schimmel 已提交
4078
	if (data[IFLA_VXLAN_AGEING])
R
Roopa Prabhu 已提交
4079
		conf->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
S
stephen hemminger 已提交
4080

R
Roopa Prabhu 已提交
4081
	if (data[IFLA_VXLAN_PROXY]) {
4082 4083 4084 4085 4086
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_PROXY,
				    VXLAN_F_PROXY, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4087
	}
D
David Stevens 已提交
4088

R
Roopa Prabhu 已提交
4089
	if (data[IFLA_VXLAN_RSC]) {
4090 4091 4092 4093 4094
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_RSC,
				    VXLAN_F_RSC, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4095
	}
D
David Stevens 已提交
4096

R
Roopa Prabhu 已提交
4097
	if (data[IFLA_VXLAN_L2MISS]) {
4098 4099 4100 4101 4102
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L2MISS,
				    VXLAN_F_L2MISS, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4103
	}
D
David Stevens 已提交
4104

R
Roopa Prabhu 已提交
4105
	if (data[IFLA_VXLAN_L3MISS]) {
4106 4107 4108 4109 4110
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_L3MISS,
				    VXLAN_F_L3MISS, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4111
	}
D
David Stevens 已提交
4112

R
Roopa Prabhu 已提交
4113
	if (data[IFLA_VXLAN_LIMIT]) {
4114 4115 4116
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_LIMIT],
					    "Cannot change limit");
R
Roopa Prabhu 已提交
4117
			return -EOPNOTSUPP;
4118
		}
R
Roopa Prabhu 已提交
4119 4120
		conf->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);
	}
S
stephen hemminger 已提交
4121

R
Roopa Prabhu 已提交
4122
	if (data[IFLA_VXLAN_COLLECT_METADATA]) {
4123 4124 4125 4126 4127
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_COLLECT_METADATA,
				    VXLAN_F_COLLECT_METADATA, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4128
	}
4129

4130
	if (data[IFLA_VXLAN_PORT_RANGE]) {
R
Roopa Prabhu 已提交
4131 4132 4133 4134 4135 4136
		if (!changelink) {
			const struct ifla_vxlan_port_range *p
				= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
			conf->port_min = ntohs(p->low);
			conf->port_max = ntohs(p->high);
		} else {
4137 4138
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT_RANGE],
					    "Cannot change port range");
R
Roopa Prabhu 已提交
4139 4140
			return -EOPNOTSUPP;
		}
4141 4142
	}

R
Roopa Prabhu 已提交
4143
	if (data[IFLA_VXLAN_PORT]) {
4144 4145 4146
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_PORT],
					    "Cannot change port");
R
Roopa Prabhu 已提交
4147
			return -EOPNOTSUPP;
4148
		}
R
Roopa Prabhu 已提交
4149 4150
		conf->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);
	}
4151

R
Roopa Prabhu 已提交
4152
	if (data[IFLA_VXLAN_UDP_CSUM]) {
4153 4154 4155
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_VXLAN_UDP_CSUM],
					    "Cannot change UDP_CSUM flag");
R
Roopa Prabhu 已提交
4156
			return -EOPNOTSUPP;
4157
		}
R
Roopa Prabhu 已提交
4158 4159 4160
		if (!nla_get_u8(data[IFLA_VXLAN_UDP_CSUM]))
			conf->flags |= VXLAN_F_UDP_ZERO_CSUM_TX;
	}
4161

R
Roopa Prabhu 已提交
4162
	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_TX]) {
4163 4164 4165 4166 4167
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
				    VXLAN_F_UDP_ZERO_CSUM6_TX, changelink,
				    false, extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4168
	}
4169

R
Roopa Prabhu 已提交
4170
	if (data[IFLA_VXLAN_UDP_ZERO_CSUM6_RX]) {
4171 4172 4173 4174 4175
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
				    VXLAN_F_UDP_ZERO_CSUM6_RX, changelink,
				    false, extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4176
	}
4177

R
Roopa Prabhu 已提交
4178
	if (data[IFLA_VXLAN_REMCSUM_TX]) {
4179 4180 4181 4182 4183
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_TX,
				    VXLAN_F_REMCSUM_TX, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4184
	}
T
Tom Herbert 已提交
4185

R
Roopa Prabhu 已提交
4186
	if (data[IFLA_VXLAN_REMCSUM_RX]) {
4187 4188 4189 4190 4191
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_RX,
				    VXLAN_F_REMCSUM_RX, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4192 4193 4194
	}

	if (data[IFLA_VXLAN_GBP]) {
4195 4196 4197 4198
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GBP,
				    VXLAN_F_GBP, changelink, false, extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4199 4200 4201
	}

	if (data[IFLA_VXLAN_GPE]) {
4202 4203 4204 4205 4206
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_GPE,
				    VXLAN_F_GPE, changelink, false,
				    extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4207 4208 4209
	}

	if (data[IFLA_VXLAN_REMCSUM_NOPARTIAL]) {
4210 4211 4212 4213 4214
		err = vxlan_nl2flag(conf, data, IFLA_VXLAN_REMCSUM_NOPARTIAL,
				    VXLAN_F_REMCSUM_NOPARTIAL, changelink,
				    false, extack);
		if (err)
			return err;
R
Roopa Prabhu 已提交
4215 4216 4217
	}

	if (tb[IFLA_MTU]) {
4218 4219 4220
		if (changelink) {
			NL_SET_ERR_MSG_ATTR(extack, tb[IFLA_MTU],
					    "Cannot change mtu");
R
Roopa Prabhu 已提交
4221
			return -EOPNOTSUPP;
4222
		}
R
Roopa Prabhu 已提交
4223 4224 4225
		conf->mtu = nla_get_u32(tb[IFLA_MTU]);
	}

4226 4227 4228
	if (data[IFLA_VXLAN_DF])
		conf->df = nla_get_u8(data[IFLA_VXLAN_DF]);

R
Roopa Prabhu 已提交
4229 4230 4231 4232
	return 0;
}

static int vxlan_newlink(struct net *src_net, struct net_device *dev,
4233 4234
			 struct nlattr *tb[], struct nlattr *data[],
			 struct netlink_ext_ack *extack)
R
Roopa Prabhu 已提交
4235 4236 4237 4238
{
	struct vxlan_config conf;
	int err;

4239
	err = vxlan_nl2conf(tb, data, dev, &conf, false, extack);
R
Roopa Prabhu 已提交
4240 4241 4242
	if (err)
		return err;

4243
	return __vxlan_dev_create(src_net, dev, &conf, extack);
R
Roopa Prabhu 已提交
4244
}
T
Tom Herbert 已提交
4245

R
Roopa Prabhu 已提交
4246
static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
4247 4248
			    struct nlattr *data[],
			    struct netlink_ext_ack *extack)
R
Roopa Prabhu 已提交
4249 4250
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
4251
	struct net_device *lowerdev;
R
Roopa Prabhu 已提交
4252
	struct vxlan_config conf;
4253
	struct vxlan_rdst *dst;
R
Roopa Prabhu 已提交
4254 4255
	int err;

4256
	dst = &vxlan->default_dst;
4257
	err = vxlan_nl2conf(tb, data, dev, &conf, true, extack);
R
Roopa Prabhu 已提交
4258 4259
	if (err)
		return err;
T
Thomas Graf 已提交
4260

4261 4262
	err = vxlan_config_validate(vxlan->net, &conf, &lowerdev,
				    vxlan, extack);
R
Roopa Prabhu 已提交
4263 4264
	if (err)
		return err;
4265

4266 4267 4268
	if (dst->remote_dev == lowerdev)
		lowerdev = NULL;

4269 4270 4271 4272 4273
	err = netdev_adjacent_change_prepare(dst->remote_dev, lowerdev, dev,
					     extack);
	if (err)
		return err;

R
Roopa Prabhu 已提交
4274
	/* handle default dst entry */
4275
	if (!vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip)) {
4276 4277 4278
		u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni);

		spin_lock_bh(&vxlan->hash_lock[hash_index]);
4279
		if (!vxlan_addr_any(&conf.remote_ip)) {
4280
			err = vxlan_fdb_update(vxlan, all_zeros_mac,
4281
					       &conf.remote_ip,
R
Roopa Prabhu 已提交
4282
					       NUD_REACHABLE | NUD_PERMANENT,
4283
					       NLM_F_APPEND | NLM_F_CREATE,
R
Roopa Prabhu 已提交
4284
					       vxlan->cfg.dst_port,
4285 4286
					       conf.vni, conf.vni,
					       conf.remote_ifindex,
4287
					       NTF_SELF, 0, true, extack);
R
Roopa Prabhu 已提交
4288
			if (err) {
4289
				spin_unlock_bh(&vxlan->hash_lock[hash_index]);
4290 4291
				netdev_adjacent_change_abort(dst->remote_dev,
							     lowerdev, dev);
R
Roopa Prabhu 已提交
4292 4293 4294
				return err;
			}
		}
4295 4296 4297 4298 4299 4300 4301 4302
		if (!vxlan_addr_any(&dst->remote_ip))
			__vxlan_fdb_delete(vxlan, all_zeros_mac,
					   dst->remote_ip,
					   vxlan->cfg.dst_port,
					   dst->remote_vni,
					   dst->remote_vni,
					   dst->remote_ifindex,
					   true);
4303
		spin_unlock_bh(&vxlan->hash_lock[hash_index]);
R
Roopa Prabhu 已提交
4304
	}
4305

4306 4307 4308
	if (conf.age_interval != vxlan->cfg.age_interval)
		mod_timer(&vxlan->age_timer, jiffies);

4309
	netdev_adjacent_change_commit(dst->remote_dev, lowerdev, dev);
4310
	if (lowerdev && lowerdev != dst->remote_dev)
4311
		dst->remote_dev = lowerdev;
4312
	vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true);
R
Roopa Prabhu 已提交
4313
	return 0;
S
stephen hemminger 已提交
4314 4315 4316 4317 4318 4319
}

static void vxlan_dellink(struct net_device *dev, struct list_head *head)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

4320 4321
	vxlan_flush(vxlan, true);

4322
	list_del(&vxlan->next);
S
stephen hemminger 已提交
4323
	unregister_netdevice_queue(dev, head);
4324 4325
	if (vxlan->default_dst.remote_dev)
		netdev_upper_dev_unlink(vxlan->default_dst.remote_dev, dev);
S
stephen hemminger 已提交
4326 4327 4328 4329 4330 4331
}

static size_t vxlan_get_size(const struct net_device *dev)
{

	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
C
Cong Wang 已提交
4332
		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_GROUP{6} */
S
stephen hemminger 已提交
4333
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
C
Cong Wang 已提交
4334
		nla_total_size(sizeof(struct in6_addr)) + /* IFLA_VXLAN_LOCAL{6} */
S
stephen hemminger 已提交
4335
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
H
Hangbin Liu 已提交
4336
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL_INHERIT */
S
stephen hemminger 已提交
4337
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
4338
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_DF */
4339
		nla_total_size(sizeof(__be32)) + /* IFLA_VXLAN_LABEL */
S
stephen hemminger 已提交
4340
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
D
David Stevens 已提交
4341 4342 4343 4344
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L2MISS */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L3MISS */
4345
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_COLLECT_METADATA */
S
stephen hemminger 已提交
4346 4347
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
4348
		nla_total_size(sizeof(struct ifla_vxlan_port_range)) +
4349 4350 4351 4352
		nla_total_size(sizeof(__be16)) + /* IFLA_VXLAN_PORT */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_CSUM */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_TX */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_UDP_ZERO_CSUM6_RX */
T
Tom Herbert 已提交
4353 4354
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_TX */
		nla_total_size(sizeof(__u8)) + /* IFLA_VXLAN_REMCSUM_RX */
S
stephen hemminger 已提交
4355 4356 4357 4358 4359 4360
		0;
}

static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	const struct vxlan_dev *vxlan = netdev_priv(dev);
4361
	const struct vxlan_rdst *dst = &vxlan->default_dst;
4362
	struct ifla_vxlan_port_range ports = {
4363 4364
		.low =  htons(vxlan->cfg.port_min),
		.high = htons(vxlan->cfg.port_max),
4365
	};
S
stephen hemminger 已提交
4366

4367
	if (nla_put_u32(skb, IFLA_VXLAN_ID, be32_to_cpu(dst->remote_vni)))
S
stephen hemminger 已提交
4368 4369
		goto nla_put_failure;

C
Cong Wang 已提交
4370 4371
	if (!vxlan_addr_any(&dst->remote_ip)) {
		if (dst->remote_ip.sa.sa_family == AF_INET) {
4372 4373
			if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP,
					    dst->remote_ip.sin.sin_addr.s_addr))
C
Cong Wang 已提交
4374 4375 4376
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
4377 4378
			if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6,
					     &dst->remote_ip.sin6.sin6_addr))
C
Cong Wang 已提交
4379 4380 4381 4382
				goto nla_put_failure;
#endif
		}
	}
S
stephen hemminger 已提交
4383

4384
	if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
S
stephen hemminger 已提交
4385 4386
		goto nla_put_failure;

4387 4388
	if (!vxlan_addr_any(&vxlan->cfg.saddr)) {
		if (vxlan->cfg.saddr.sa.sa_family == AF_INET) {
4389
			if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
4390
					    vxlan->cfg.saddr.sin.sin_addr.s_addr))
C
Cong Wang 已提交
4391 4392 4393
				goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
		} else {
4394
			if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
4395
					     &vxlan->cfg.saddr.sin6.sin6_addr))
C
Cong Wang 已提交
4396 4397 4398 4399
				goto nla_put_failure;
#endif
		}
	}
S
stephen hemminger 已提交
4400

4401
	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->cfg.ttl) ||
H
Hangbin Liu 已提交
4402 4403
	    nla_put_u8(skb, IFLA_VXLAN_TTL_INHERIT,
		       !!(vxlan->cfg.flags & VXLAN_F_TTL_INHERIT)) ||
4404
	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->cfg.tos) ||
4405
	    nla_put_u8(skb, IFLA_VXLAN_DF, vxlan->cfg.df) ||
4406
	    nla_put_be32(skb, IFLA_VXLAN_LABEL, vxlan->cfg.label) ||
D
David Stevens 已提交
4407
	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
4408
		       !!(vxlan->cfg.flags & VXLAN_F_LEARN)) ||
D
David Stevens 已提交
4409
	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
4410
		       !!(vxlan->cfg.flags & VXLAN_F_PROXY)) ||
4411 4412
	    nla_put_u8(skb, IFLA_VXLAN_RSC,
		       !!(vxlan->cfg.flags & VXLAN_F_RSC)) ||
D
David Stevens 已提交
4413
	    nla_put_u8(skb, IFLA_VXLAN_L2MISS,
4414
		       !!(vxlan->cfg.flags & VXLAN_F_L2MISS)) ||
D
David Stevens 已提交
4415
	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
4416
		       !!(vxlan->cfg.flags & VXLAN_F_L3MISS)) ||
4417
	    nla_put_u8(skb, IFLA_VXLAN_COLLECT_METADATA,
4418
		       !!(vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)) ||
4419 4420 4421
	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->cfg.age_interval) ||
	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->cfg.addrmax) ||
	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->cfg.dst_port) ||
4422
	    nla_put_u8(skb, IFLA_VXLAN_UDP_CSUM,
4423
		       !(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM_TX)) ||
4424
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_TX,
4425
		       !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_TX)) ||
4426
	    nla_put_u8(skb, IFLA_VXLAN_UDP_ZERO_CSUM6_RX,
4427
		       !!(vxlan->cfg.flags & VXLAN_F_UDP_ZERO_CSUM6_RX)) ||
T
Tom Herbert 已提交
4428
	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_TX,
4429
		       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_TX)) ||
T
Tom Herbert 已提交
4430
	    nla_put_u8(skb, IFLA_VXLAN_REMCSUM_RX,
4431
		       !!(vxlan->cfg.flags & VXLAN_F_REMCSUM_RX)))
S
stephen hemminger 已提交
4432 4433
		goto nla_put_failure;

4434 4435 4436
	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
		goto nla_put_failure;

4437
	if (vxlan->cfg.flags & VXLAN_F_GBP &&
T
Thomas Graf 已提交
4438 4439 4440
	    nla_put_flag(skb, IFLA_VXLAN_GBP))
		goto nla_put_failure;

4441
	if (vxlan->cfg.flags & VXLAN_F_GPE &&
J
Jiri Benc 已提交
4442 4443 4444
	    nla_put_flag(skb, IFLA_VXLAN_GPE))
		goto nla_put_failure;

4445
	if (vxlan->cfg.flags & VXLAN_F_REMCSUM_NOPARTIAL &&
4446 4447 4448
	    nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
		goto nla_put_failure;

S
stephen hemminger 已提交
4449 4450 4451 4452 4453 4454
	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

4455 4456 4457 4458 4459 4460 4461
static struct net *vxlan_get_link_net(const struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);

	return vxlan->net;
}

S
stephen hemminger 已提交
4462 4463 4464 4465 4466 4467 4468 4469
/* rtnetlink operations for link kind "vxlan". */
static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
	.kind		= "vxlan",
	.priv_size	= sizeof(struct vxlan_dev),
	.setup		= vxlan_setup,

	/* attribute parsing */
	.maxtype	= IFLA_VXLAN_MAX,
	.policy		= vxlan_policy,
	.validate	= vxlan_validate,

	/* device lifecycle */
	.newlink	= vxlan_newlink,
	.changelink	= vxlan_changelink,
	.dellink	= vxlan_dellink,

	/* dumping */
	.get_size	= vxlan_get_size,
	.fill_info	= vxlan_fill_info,
	.get_link_net	= vxlan_get_link_net,
};

4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487
/* Create and configure a VXLAN device from an in-kernel configuration.
 *
 * Allocates the link with an empty attribute table, creates the device from
 * @conf and then configures the link.  Returns the new device or an
 * ERR_PTR(): a failed creation frees the device, a failed configuration
 * tears it down through vxlan_dellink() and unregister_netdevice_many().
 */
struct net_device *vxlan_dev_create(struct net *net, const char *name,
				    u8 name_assign_type,
				    struct vxlan_config *conf)
{
	struct nlattr *tb[IFLA_MAX + 1] = { NULL };
	struct net_device *dev;
	LIST_HEAD(list_kill);
	int err;

	dev = rtnl_create_link(net, name, name_assign_type,
			       &vxlan_link_ops, tb, NULL);
	if (IS_ERR(dev))
		return dev;

	err = __vxlan_dev_create(net, dev, conf, NULL);
	if (err < 0) {
		free_netdev(dev);
		return ERR_PTR(err);
	}

	err = rtnl_configure_link(dev, NULL);
	if (err >= 0)
		return dev;

	vxlan_dellink(dev, &list_kill);
	unregister_netdevice_many(&list_kill);

	return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(vxlan_dev_create);

4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532
static void vxlan_handle_lowerdev_unregister(struct vxlan_net *vn,
					     struct net_device *dev)
{
	struct vxlan_dev *vxlan, *next;
	LIST_HEAD(list_kill);

	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
		struct vxlan_rdst *dst = &vxlan->default_dst;

		/* In case we created vxlan device with carrier
		 * and we loose the carrier due to module unload
		 * we also need to remove vxlan device. In other
		 * cases, it's not necessary and remote_ifindex
		 * is 0 here, so no matches.
		 */
		if (dst->remote_ifindex == dev->ifindex)
			vxlan_dellink(vxlan->dev, &list_kill);
	}

	unregister_netdevice_many(&list_kill);
}

4533 4534
static int vxlan_netdevice_event(struct notifier_block *unused,
				 unsigned long event, void *ptr)
4535 4536
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4537
	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
4538

4539
	if (event == NETDEV_UNREGISTER)
4540
		vxlan_handle_lowerdev_unregister(vn, dev);
4541 4542 4543 4544
	else if (event == NETDEV_UDP_TUNNEL_PUSH_INFO)
		vxlan_offload_rx_ports(dev, true);
	else if (event == NETDEV_UDP_TUNNEL_DROP_INFO)
		vxlan_offload_rx_ports(dev, false);
4545 4546 4547 4548 4549

	return NOTIFY_DONE;
}

static struct notifier_block vxlan_notifier_block __read_mostly = {
4550
	.notifier_call = vxlan_netdevice_event,
4551 4552
};

4553 4554 4555 4556 4557 4558 4559
static void
vxlan_fdb_offloaded_set(struct net_device *dev,
			struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *rdst;
	struct vxlan_fdb *f;
4560 4561 4562
	u32 hash_index;

	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
4563

4564
	spin_lock_bh(&vxlan->hash_lock[hash_index]);
4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579

	f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
	if (!f)
		goto out;

	rdst = vxlan_fdb_find_rdst(f, &fdb_info->remote_ip,
				   fdb_info->remote_port,
				   fdb_info->remote_vni,
				   fdb_info->remote_ifindex);
	if (!rdst)
		goto out;

	rdst->offloaded = fdb_info->offloaded;

out:
4580
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
4581 4582
}

P
Petr Machata 已提交
4583 4584 4585 4586 4587
static int
vxlan_fdb_external_learn_add(struct net_device *dev,
			     struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
4588
	struct netlink_ext_ack *extack;
4589
	u32 hash_index;
P
Petr Machata 已提交
4590 4591
	int err;

4592
	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
4593 4594
	extack = switchdev_notifier_info_to_extack(&fdb_info->info);

4595
	spin_lock_bh(&vxlan->hash_lock[hash_index]);
P
Petr Machata 已提交
4596 4597 4598 4599 4600 4601 4602 4603
	err = vxlan_fdb_update(vxlan, fdb_info->eth_addr, &fdb_info->remote_ip,
			       NUD_REACHABLE,
			       NLM_F_CREATE | NLM_F_REPLACE,
			       fdb_info->remote_port,
			       fdb_info->vni,
			       fdb_info->remote_vni,
			       fdb_info->remote_ifindex,
			       NTF_USE | NTF_SELF | NTF_EXT_LEARNED,
4604
			       0, false, extack);
4605
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
P
Petr Machata 已提交
4606 4607 4608 4609 4610 4611 4612 4613 4614 4615

	return err;
}

static int
vxlan_fdb_external_learn_del(struct net_device *dev,
			     struct switchdev_notifier_vxlan_fdb_info *fdb_info)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_fdb *f;
4616
	u32 hash_index;
P
Petr Machata 已提交
4617 4618
	int err = 0;

4619 4620
	hash_index = fdb_head_index(vxlan, fdb_info->eth_addr, fdb_info->vni);
	spin_lock_bh(&vxlan->hash_lock[hash_index]);
P
Petr Machata 已提交
4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633

	f = vxlan_find_mac(vxlan, fdb_info->eth_addr, fdb_info->vni);
	if (!f)
		err = -ENOENT;
	else if (f->flags & NTF_EXT_LEARNED)
		err = __vxlan_fdb_delete(vxlan, fdb_info->eth_addr,
					 fdb_info->remote_ip,
					 fdb_info->remote_port,
					 fdb_info->vni,
					 fdb_info->remote_vni,
					 fdb_info->remote_ifindex,
					 false);

4634
	spin_unlock_bh(&vxlan->hash_lock[hash_index]);
P
Petr Machata 已提交
4635 4636 4637 4638

	return err;
}

4639 4640 4641 4642
static int vxlan_switchdev_event(struct notifier_block *unused,
				 unsigned long event, void *ptr)
{
	struct net_device *dev = switchdev_notifier_info_to_dev(ptr);
P
Petr Machata 已提交
4643 4644
	struct switchdev_notifier_vxlan_fdb_info *fdb_info;
	int err = 0;
4645 4646 4647 4648 4649

	switch (event) {
	case SWITCHDEV_VXLAN_FDB_OFFLOADED:
		vxlan_fdb_offloaded_set(dev, ptr);
		break;
P
Petr Machata 已提交
4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669
	case SWITCHDEV_VXLAN_FDB_ADD_TO_BRIDGE:
		fdb_info = ptr;
		err = vxlan_fdb_external_learn_add(dev, fdb_info);
		if (err) {
			err = notifier_from_errno(err);
			break;
		}
		fdb_info->offloaded = true;
		vxlan_fdb_offloaded_set(dev, fdb_info);
		break;
	case SWITCHDEV_VXLAN_FDB_DEL_TO_BRIDGE:
		fdb_info = ptr;
		err = vxlan_fdb_external_learn_del(dev, fdb_info);
		if (err) {
			err = notifier_from_errno(err);
			break;
		}
		fdb_info->offloaded = false;
		vxlan_fdb_offloaded_set(dev, fdb_info);
		break;
4670 4671
	}

P
Petr Machata 已提交
4672
	return err;
4673 4674 4675 4676 4677 4678
}

/* Notifier block whose callback is vxlan_switchdev_event(). */
static struct notifier_block vxlan_switchdev_notifier_block __read_mostly = {
	.notifier_call = vxlan_switchdev_event,
};

4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698
/* Destroy every FDB entry linked on @nh's fdb_list.
 *
 * The list is walked under rcu_read_lock(); each entry is destroyed under
 * the hash bucket lock of its owning device, and only if it is still hashed.
 * An entry without an owning device triggers a warning and is skipped,
 * because the owner is needed to find the lock.
 */
static void vxlan_fdb_nh_flush(struct nexthop *nh)
{
	struct vxlan_fdb *fdb;
	struct vxlan_dev *vxlan;
	u32 hash_index;

	rcu_read_lock();
	list_for_each_entry_rcu(fdb, &nh->fdb_list, nh_list) {
		vxlan = rcu_dereference(fdb->vdev);
		/* Warn and skip instead of dereferencing a NULL owner. */
		if (WARN_ON(!vxlan))
			continue;
		hash_index = fdb_head_index(vxlan, fdb->eth_addr,
					    vxlan->default_dst.remote_vni);
		spin_lock_bh(&vxlan->hash_lock[hash_index]);
		if (!hlist_unhashed(&fdb->hlist))
			vxlan_fdb_destroy(vxlan, fdb, false, false);
		spin_unlock_bh(&vxlan->hash_lock[hash_index]);
	}
	rcu_read_unlock();
}

4699 4700 4701
static int vxlan_nexthop_event(struct notifier_block *nb,
			       unsigned long event, void *ptr)
{
4702 4703 4704 4705 4706
	struct nh_notifier_info *info = ptr;
	struct nexthop *nh;

	if (event != NEXTHOP_EVENT_DEL)
		return NOTIFY_DONE;
4707

4708 4709
	nh = nexthop_find_by_id(info->net, info->id);
	if (!nh)
4710 4711
		return NOTIFY_DONE;

4712
	vxlan_fdb_nh_flush(nh);
4713 4714 4715 4716

	return NOTIFY_DONE;
}

S
stephen hemminger 已提交
4717 4718 4719
static __net_init int vxlan_init_net(struct net *net)
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
4720
	unsigned int h;
S
stephen hemminger 已提交
4721

4722
	INIT_LIST_HEAD(&vn->vxlan_list);
4723
	spin_lock_init(&vn->sock_lock);
4724
	vn->nexthop_notifier_block.notifier_call = vxlan_nexthop_event;
S
stephen hemminger 已提交
4725

4726 4727
	for (h = 0; h < PORT_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vn->sock_list[h]);
S
stephen hemminger 已提交
4728

4729 4730
	return register_nexthop_notifier(net, &vn->nexthop_notifier_block,
					 NULL);
S
stephen hemminger 已提交
4731 4732
}

4733
static void vxlan_destroy_tunnels(struct net *net, struct list_head *head)
N
Nicolas Dichtel 已提交
4734 4735 4736 4737 4738 4739 4740
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan, *next;
	struct net_device *dev, *aux;

	for_each_netdev_safe(net, dev, aux)
		if (dev->rtnl_link_ops == &vxlan_link_ops)
4741
			unregister_netdevice_queue(dev, head);
N
Nicolas Dichtel 已提交
4742 4743 4744 4745 4746

	list_for_each_entry_safe(vxlan, next, &vn->vxlan_list, next) {
		/* If vxlan->dev is in the same netns, it has already been added
		 * to the list by the previous loop.
		 */
4747
		if (!net_eq(dev_net(vxlan->dev), net))
4748
			unregister_netdevice_queue(vxlan->dev, head);
N
Nicolas Dichtel 已提交
4749 4750 4751 4752
	}

}

4753 4754 4755 4756
/* Batched per-netns exit.
 *
 * Under RTNL: unregister the nexthop notifier of every exiting namespace,
 * then collect and unregister all of their VXLAN devices in one batch.
 * After dropping RTNL, warn once if any socket hash bucket is not empty.
 */
static void __net_exit vxlan_exit_batch_net(struct list_head *net_list)
{
	struct vxlan_net *vn;
	struct net *net;
	LIST_HEAD(kill_list);
	unsigned int bucket;

	rtnl_lock();

	list_for_each_entry(net, net_list, exit_list) {
		vn = net_generic(net, vxlan_net_id);
		unregister_nexthop_notifier(net, &vn->nexthop_notifier_block);
	}

	list_for_each_entry(net, net_list, exit_list)
		vxlan_destroy_tunnels(net, &kill_list);

	unregister_netdevice_many(&kill_list);

	rtnl_unlock();

	list_for_each_entry(net, net_list, exit_list) {
		vn = net_generic(net, vxlan_net_id);

		for (bucket = 0; bucket != PORT_HASH_SIZE; bucket++)
			WARN_ON_ONCE(!hlist_empty(&vn->sock_list[bucket]));
	}
}

S
stephen hemminger 已提交
4779 4780
static struct pernet_operations vxlan_net_ops = {
	.init = vxlan_init_net,
4781
	.exit_batch = vxlan_exit_batch_net,
S
stephen hemminger 已提交
4782 4783 4784 4785 4786 4787 4788 4789 4790 4791
	.id   = &vxlan_net_id,
	.size = sizeof(struct vxlan_net),
};

/* Module init: seed vxlan_salt, then register the pernet subsystem, the
 * netdevice notifier, the switchdev notifier and the rtnl link ops, in that
 * order.  A failure unwinds the earlier registrations in reverse order and
 * returns the error.
 */
static int __init vxlan_init_module(void)
{
	int rc;

	get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));

	rc = register_pernet_subsys(&vxlan_net_ops);
	if (rc)
		return rc;

	rc = register_netdevice_notifier(&vxlan_notifier_block);
	if (rc)
		goto err_pernet;

	rc = register_switchdev_notifier(&vxlan_switchdev_notifier_block);
	if (rc)
		goto err_netdev_notifier;

	rc = rtnl_link_register(&vxlan_link_ops);
	if (rc)
		goto err_switchdev_notifier;

	return 0;

err_switchdev_notifier:
	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
err_netdev_notifier:
	unregister_netdevice_notifier(&vxlan_notifier_block);
err_pernet:
	unregister_pernet_subsys(&vxlan_net_ops);
	return rc;
}
4818
late_initcall(vxlan_init_module);
S
stephen hemminger 已提交
4819 4820 4821

/* Module exit: undo the registrations made by vxlan_init_module() in
 * reverse order.
 */
static void __exit vxlan_cleanup_module(void)
{
	rtnl_link_unregister(&vxlan_link_ops);

	unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
	unregister_netdevice_notifier(&vxlan_notifier_block);

	/* rcu_barrier() is called by netns */
	unregister_pernet_subsys(&vxlan_net_ops);
}
module_exit(vxlan_cleanup_module);

MODULE_LICENSE("GPL");
MODULE_VERSION(VXLAN_VERSION);
4832
MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>");
J
Jesse Brandeburg 已提交
4833
MODULE_DESCRIPTION("Driver for VXLAN encapsulated traffic");
S
stephen hemminger 已提交
4834
MODULE_ALIAS_RTNL_LINK("vxlan");