ipvlan_main.c 28.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11
/* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of
 * the License, or (at your option) any later version.
 *
 */

#include "ipvlan.h"

12 13 14 15 16
/* net_generic() slot holding ipvlan's per-network-namespace state. */
static unsigned int ipvlan_netid __read_mostly;

struct ipvlan_netns {
	/* Count of L3S users in this netns; nf hooks stay registered while > 0 */
	unsigned int ipvl_nf_hook_refcnt;
};
M
Mahesh Bandewar 已提交
17

18
/* LOCAL_IN hooks used in L3S mode: run last (INT_MAX priority) so the
 * slave device, not the lower device, is seen as the input interface.
 */
static const struct nf_hook_ops ipvl_nfops[] = {
	{
		.hook     = ipvlan_nf_input,
		.pf       = NFPROTO_IPV4,
		.hooknum  = NF_INET_LOCAL_IN,
		.priority = INT_MAX,
	},
#if IS_ENABLED(CONFIG_IPV6)
	{
		.hook     = ipvlan_nf_input,
		.pf       = NFPROTO_IPV6,
		.hooknum  = NF_INET_LOCAL_IN,
		.priority = INT_MAX,
	},
#endif
};

35
/* l3mdev receive hook installed on the lower device while in L3S mode. */
static const struct l3mdev_ops ipvl_l3mdev_ops = {
	.l3mdev_l3_rcv = ipvlan_l3_rcv,
};

M
Mahesh Bandewar 已提交
39
/* Keep the slave's MTU in sync with the lower device @dev. */
static void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev)
{
	ipvlan->dev->mtu = dev->mtu;
}

44
static int ipvlan_register_nf_hook(struct net *net)
M
Mahesh Bandewar 已提交
45
{
46
	struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid);
M
Mahesh Bandewar 已提交
47 48
	int err = 0;

49 50 51
	if (!vnet->ipvl_nf_hook_refcnt) {
		err = nf_register_net_hooks(net, ipvl_nfops,
					    ARRAY_SIZE(ipvl_nfops));
M
Mahesh Bandewar 已提交
52
		if (!err)
53
			vnet->ipvl_nf_hook_refcnt = 1;
M
Mahesh Bandewar 已提交
54
	} else {
55
		vnet->ipvl_nf_hook_refcnt++;
M
Mahesh Bandewar 已提交
56 57 58 59 60
	}

	return err;
}

61
static void ipvlan_unregister_nf_hook(struct net *net)
M
Mahesh Bandewar 已提交
62
{
63 64 65 66
	struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid);

	if (WARN_ON(!vnet->ipvl_nf_hook_refcnt))
		return;
M
Mahesh Bandewar 已提交
67

68 69 70 71
	vnet->ipvl_nf_hook_refcnt--;
	if (!vnet->ipvl_nf_hook_refcnt)
		nf_unregister_net_hooks(net, ipvl_nfops,
					ARRAY_SIZE(ipvl_nfops));
M
Mahesh Bandewar 已提交
72 73 74
}

/* Switch the port (and hence every slave on it) to mode @nval.
 * Handles the extra setup/teardown L3S needs on the lower device and
 * updates IFF_NOARP on each slave.  Must run under RTNL.
 */
static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval)
{
	struct net_device *mdev = port->dev;
	struct ipvl_dev *ipvlan;
	int err;

	ASSERT_RTNL();
	if (port->mode == nval)
		return 0;

	if (nval == IPVLAN_MODE_L3S) {
		/* Entering L3S: hook LOCAL_IN and make the lower device
		 * an l3mdev master.
		 */
		err = ipvlan_register_nf_hook(read_pnet(&port->pnet));
		if (err)
			return err;
		mdev->l3mdev_ops = &ipvl_l3mdev_ops;
		mdev->priv_flags |= IFF_L3MDEV_MASTER;
	} else if (port->mode == IPVLAN_MODE_L3S) {
		/* Leaving L3S: undo the master setup above. */
		mdev->priv_flags &= ~IFF_L3MDEV_MASTER;
		ipvlan_unregister_nf_hook(read_pnet(&port->pnet));
		mdev->l3mdev_ops = NULL;
	}

	/* The L3 modes never use ARP on the slaves. */
	list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
		if (nval == IPVLAN_MODE_L3 || nval == IPVLAN_MODE_L3S)
			ipvlan->dev->flags |= IFF_NOARP;
		else
			ipvlan->dev->flags &= ~IFF_NOARP;
	}
	port->mode = nval;

	return 0;
}

/* Allocate the shared ipvl_port for lower device @dev and claim its
 * rx_handler.  Called when the first ipvlan slave is created on @dev.
 */
static int ipvlan_port_create(struct net_device *dev)
{
	struct ipvl_port *port;
	int err, idx;

	port = kzalloc(sizeof(struct ipvl_port), GFP_KERNEL);
	if (!port)
		return -ENOMEM;

	write_pnet(&port->pnet, dev_net(dev));
	port->dev = dev;
	port->mode = IPVLAN_MODE_L3;	/* default mode */
	INIT_LIST_HEAD(&port->ipvlans);
	for (idx = 0; idx < IPVLAN_HASH_SIZE; idx++)
		INIT_HLIST_HEAD(&port->hlhead[idx]);

	/* Multicast/broadcast delivery is deferred to a workqueue. */
	skb_queue_head_init(&port->backlog);
	INIT_WORK(&port->wq, ipvlan_process_multicast);
	ida_init(&port->ida);
	/* Slave dev_ids are handed out starting at 1 (see ipvlan_link_new) */
	port->dev_id_start = 1;

	err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port);
	if (err)
		goto err;

	return 0;

err:
	kfree(port);
	return err;
}

/* Undo ipvlan_port_create() once the last slave is gone: drop L3S state,
 * release the rx_handler and flush the multicast backlog.
 */
static void ipvlan_port_destroy(struct net_device *dev)
{
	struct ipvl_port *port = ipvlan_port_get_rtnl(dev);
	struct sk_buff *skb;

	if (port->mode == IPVLAN_MODE_L3S) {
		dev->priv_flags &= ~IFF_L3MDEV_MASTER;
		ipvlan_unregister_nf_hook(dev_net(dev));
		dev->l3mdev_ops = NULL;
	}
	netdev_rx_handler_unregister(dev);
	cancel_work_sync(&port->wq);
	/* Drop skbs still queued for the multicast worker; each one holds
	 * a reference on skb->dev.
	 */
	while ((skb = __skb_dequeue(&port->backlog)) != NULL) {
		if (skb->dev)
			dev_put(skb->dev);
		kfree_skb(skb);
	}
	ida_destroy(&port->ida);
	kfree(port);
}

/* Feature bits a slave may share with / inherit from the lower device. */
#define IPVLAN_FEATURES \
	(NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \
	 NETIF_F_GSO | NETIF_F_TSO | NETIF_F_GSO_ROBUST | \
	 NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \
	 NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER)

/* Link-state bits mirrored from the lower device in ipvlan_init(). */
#define IPVLAN_STATE_MASK \
	((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT))

/* ndo_init: inherit state/features from the lower device, allocate the
 * per-CPU stats and make sure the lower device has an ipvl_port.
 */
static int ipvlan_init(struct net_device *dev)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct net_device *phy_dev = ipvlan->phy_dev;
	struct ipvl_port *port;
	int err;

	dev->state = (dev->state & ~IPVLAN_STATE_MASK) |
		     (phy_dev->state & IPVLAN_STATE_MASK);
	dev->features = phy_dev->features & IPVLAN_FEATURES;
	dev->features |= NETIF_F_LLTX | NETIF_F_VLAN_CHALLENGED;
	dev->gso_max_size = phy_dev->gso_max_size;
	dev->gso_max_segs = phy_dev->gso_max_segs;
	dev->hard_header_len = phy_dev->hard_header_len;

	netdev_lockdep_set_classes(dev);

	ipvlan->pcpu_stats = netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats);
	if (!ipvlan->pcpu_stats)
		return -ENOMEM;

	/* The first slave on a lower device creates the shared port. */
	if (!netif_is_ipvlan_port(phy_dev)) {
		err = ipvlan_port_create(phy_dev);
		if (err < 0) {
			free_percpu(ipvlan->pcpu_stats);
			return err;
		}
	}
	port = ipvlan_port_get_rtnl(phy_dev);
	port->count += 1;
	return 0;
}

static void ipvlan_uninit(struct net_device *dev)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
205 206
	struct net_device *phy_dev = ipvlan->phy_dev;
	struct ipvl_port *port;
207

208
	free_percpu(ipvlan->pcpu_stats);
209

210
	port = ipvlan_port_get_rtnl(phy_dev);
211 212 213 214 215 216 217 218 219 220 221
	port->count -= 1;
	if (!port->count)
		ipvlan_port_destroy(port->dev);
}

static int ipvlan_open(struct net_device *dev)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct net_device *phy_dev = ipvlan->phy_dev;
	struct ipvl_addr *addr;

M
Mahesh Bandewar 已提交
222 223
	if (ipvlan->port->mode == IPVLAN_MODE_L3 ||
	    ipvlan->port->mode == IPVLAN_MODE_L3S)
224 225 226 227
		dev->flags |= IFF_NOARP;
	else
		dev->flags &= ~IFF_NOARP;

228 229
	rcu_read_lock();
	list_for_each_entry_rcu(addr, &ipvlan->addrs, anode)
230
		ipvlan_ht_addr_add(ipvlan, addr);
231
	rcu_read_unlock();
232

233 234 235 236 237 238 239 240 241 242 243 244 245 246
	return dev_uc_add(phy_dev, phy_dev->dev_addr);
}

/* ndo_stop: stop mirroring address lists to the lower device and unhash
 * all of this slave's addresses so it stops receiving.
 */
static int ipvlan_stop(struct net_device *dev)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct net_device *phy_dev = ipvlan->phy_dev;
	struct ipvl_addr *addr;

	dev_uc_unsync(phy_dev, dev);
	dev_mc_unsync(phy_dev, dev);

	dev_uc_del(phy_dev, phy_dev->dev_addr);

	rcu_read_lock();
	list_for_each_entry_rcu(addr, &ipvlan->addrs, anode)
		ipvlan_ht_addr_del(addr);
	rcu_read_unlock();

	return 0;
}

M
Mahesh Bandewar 已提交
255 256
static netdev_tx_t ipvlan_start_xmit(struct sk_buff *skb,
				     struct net_device *dev)
257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308
{
	const struct ipvl_dev *ipvlan = netdev_priv(dev);
	int skblen = skb->len;
	int ret;

	ret = ipvlan_queue_xmit(skb, dev);
	if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
		struct ipvl_pcpu_stats *pcptr;

		pcptr = this_cpu_ptr(ipvlan->pcpu_stats);

		u64_stats_update_begin(&pcptr->syncp);
		pcptr->tx_pkts++;
		pcptr->tx_bytes += skblen;
		u64_stats_update_end(&pcptr->syncp);
	} else {
		this_cpu_inc(ipvlan->pcpu_stats->tx_drps);
	}
	return ret;
}

/* ndo_fix_features: restrict only the IPVLAN_FEATURES bits to what this
 * slave supports; leave all other feature bits untouched.
 */
static netdev_features_t ipvlan_fix_features(struct net_device *dev,
					     netdev_features_t features)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	netdev_features_t allowed = ipvlan->sfeatures | ~IPVLAN_FEATURES;

	return features & allowed;
}

/* ndo_change_rx_flags: allmulti is accounted on the lower device. */
static void ipvlan_change_rx_flags(struct net_device *dev, int change)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);

	if (change & IFF_ALLMULTI) {
		int inc = (dev->flags & IFF_ALLMULTI) ? 1 : -1;

		dev_set_allmulti(ipvlan->phy_dev, inc);
	}
}

static void ipvlan_set_multicast_mac_filter(struct net_device *dev)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);

	if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) {
		bitmap_fill(ipvlan->mac_filters, IPVLAN_MAC_FILTER_SIZE);
	} else {
		struct netdev_hw_addr *ha;
		DECLARE_BITMAP(mc_filters, IPVLAN_MAC_FILTER_SIZE);

		bitmap_zero(mc_filters, IPVLAN_MAC_FILTER_SIZE);
		netdev_for_each_mc_addr(ha, dev)
			__set_bit(ipvlan_mac_hash(ha->addr), mc_filters);

309 310 311 312 313 314
		/* Turn-on broadcast bit irrespective of address family,
		 * since broadcast is deferred to a work-queue, hence no
		 * impact on fast-path processing.
		 */
		__set_bit(ipvlan_mac_hash(dev->broadcast), mc_filters);

315 316 317 318 319 320 321
		bitmap_copy(ipvlan->mac_filters, mc_filters,
			    IPVLAN_MAC_FILTER_SIZE);
	}
	dev_uc_sync(ipvlan->phy_dev, dev);
	dev_mc_sync(ipvlan->phy_dev, dev);
}

322 323
/* ndo_get_stats64: fold the per-CPU counters into @s.  The u64 counters
 * are read under the u64_stats seqcount retry loop; the u32 error/drop
 * counters are read without it (see comment below).
 */
static void ipvlan_get_stats64(struct net_device *dev,
			       struct rtnl_link_stats64 *s)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);

	if (ipvlan->pcpu_stats) {
		struct ipvl_pcpu_stats *pcptr;
		u64 rx_pkts, rx_bytes, rx_mcast, tx_pkts, tx_bytes;
		u32 rx_errs = 0, tx_drps = 0;
		u32 strt;
		int idx;

		for_each_possible_cpu(idx) {
			pcptr = per_cpu_ptr(ipvlan->pcpu_stats, idx);
			do {
				strt= u64_stats_fetch_begin_irq(&pcptr->syncp);
				rx_pkts = pcptr->rx_pkts;
				rx_bytes = pcptr->rx_bytes;
				rx_mcast = pcptr->rx_mcast;
				tx_pkts = pcptr->tx_pkts;
				tx_bytes = pcptr->tx_bytes;
			} while (u64_stats_fetch_retry_irq(&pcptr->syncp,
							   strt));

			s->rx_packets += rx_pkts;
			s->rx_bytes += rx_bytes;
			s->multicast += rx_mcast;
			s->tx_packets += tx_pkts;
			s->tx_bytes += tx_bytes;

			/* u32 values are updated without syncp protection. */
			rx_errs += pcptr->rx_errs;
			tx_drps += pcptr->tx_drps;
		}
		s->rx_errors = rx_errs;
		s->rx_dropped = rx_errs;
		s->tx_dropped = tx_drps;
	}
}

/* ndo_vlan_rx_add_vid: VLAN filtering lives on the lower device. */
static int ipvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);

	return vlan_vid_add(ipvlan->phy_dev, proto, vid);
}

/* ndo_vlan_rx_kill_vid: mirror of ipvlan_vlan_rx_add_vid(). */
static int ipvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto,
				   u16 vid)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);

	vlan_vid_del(ipvlan->phy_dev, proto, vid);
	return 0;
}

380 381 382 383 384 385 386
static int ipvlan_get_iflink(const struct net_device *dev)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);

	return ipvlan->phy_dev->ifindex;
}

387 388 389 390 391 392 393 394 395 396 397 398
/* Netdev ops for every ipvlan slave.  netif_is_ipvlan() relies on this
 * exact ops pointer to recognize ipvlan (and ipvtap) devices.
 */
static const struct net_device_ops ipvlan_netdev_ops = {
	.ndo_init		= ipvlan_init,
	.ndo_uninit		= ipvlan_uninit,
	.ndo_open		= ipvlan_open,
	.ndo_stop		= ipvlan_stop,
	.ndo_start_xmit		= ipvlan_start_xmit,
	.ndo_fix_features	= ipvlan_fix_features,
	.ndo_change_rx_flags	= ipvlan_change_rx_flags,
	.ndo_set_rx_mode	= ipvlan_set_multicast_mac_filter,
	.ndo_get_stats64	= ipvlan_get_stats64,
	.ndo_vlan_rx_add_vid	= ipvlan_vlan_rx_add_vid,
	.ndo_vlan_rx_kill_vid	= ipvlan_vlan_rx_kill_vid,
	.ndo_get_iflink		= ipvlan_get_iflink,
};

/* header_ops->create: build the L2 header on behalf of the slave using
 * the lower device's MAC as the default source address.
 */
static int ipvlan_hard_header(struct sk_buff *skb, struct net_device *dev,
			      unsigned short type, const void *daddr,
			      const void *saddr, unsigned len)
{
	const struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct net_device *phy_dev = ipvlan->phy_dev;

	/* TODO Probably use a different field than dev_addr so that the
	 * mac-address on the virtual device is portable and can be carried
	 * while the packets use the mac-addr on the physical device.
	 */
	return dev_hard_header(skb, phy_dev, type, daddr,
			       saddr ? : phy_dev->dev_addr, len);
}

/* Standard ethernet header ops with ->create redirected so the header
 * is built with the lower device's MAC (see ipvlan_hard_header()).
 */
static const struct header_ops ipvlan_header_ops = {
	.create  	= ipvlan_hard_header,
	.parse		= eth_header_parse,
	.cache		= eth_header_cache,
	.cache_update	= eth_header_cache_update,
};

P
Paolo Abeni 已提交
424 425 426 427 428 429
/* True for any netdev driven by this driver's netdev_ops. */
static bool netif_is_ipvlan(const struct net_device *dev)
{
	/* both ipvlan and ipvtap devices use the same netdev_ops */
	return dev->netdev_ops == &ipvlan_netdev_ops;
}

430 431
/* Report the lower device's link settings for the slave. */
static int ipvlan_ethtool_get_link_ksettings(struct net_device *dev,
					     struct ethtool_link_ksettings *cmd)
{
	const struct ipvl_dev *ipvlan = netdev_priv(dev);

	return __ethtool_get_link_ksettings(ipvlan->phy_dev, cmd);
}

/* Report the ipvlan driver name/version (not the lower device's). */
static void ipvlan_ethtool_get_drvinfo(struct net_device *dev,
				       struct ethtool_drvinfo *drvinfo)
{
	strlcpy(drvinfo->driver, IPVLAN_DRV, sizeof(drvinfo->driver));
	strlcpy(drvinfo->version, IPV_DRV_VER, sizeof(drvinfo->version));
}

/* Per-slave netif message level (used by the netif_err() calls below). */
static u32 ipvlan_ethtool_get_msglevel(struct net_device *dev)
{
	const struct ipvl_dev *ipvlan = netdev_priv(dev);

	return ipvlan->msg_enable;
}

/* Setter counterpart of ipvlan_ethtool_get_msglevel(). */
static void ipvlan_ethtool_set_msglevel(struct net_device *dev, u32 value)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);

	ipvlan->msg_enable = value;
}

/* Ethtool ops for the slave; link state/settings come from the lower dev. */
static const struct ethtool_ops ipvlan_ethtool_ops = {
	.get_link	= ethtool_op_get_link,
	.get_link_ksettings	= ipvlan_ethtool_get_link_ksettings,
	.get_drvinfo	= ipvlan_ethtool_get_drvinfo,
	.get_msglevel	= ipvlan_ethtool_get_msglevel,
	.set_msglevel	= ipvlan_ethtool_set_msglevel,
};

static int ipvlan_nl_changelink(struct net_device *dev,
468 469
				struct nlattr *tb[], struct nlattr *data[],
				struct netlink_ext_ack *extack)
470 471 472
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev);
M
Mahesh Bandewar 已提交
473
	int err = 0;
474

475 476 477 478
	if (!data)
		return 0;

	if (data[IFLA_IPVLAN_MODE]) {
479 480
		u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]);

M
Mahesh Bandewar 已提交
481
		err = ipvlan_set_port_mode(port, nmode);
482
	}
483 484 485 486 487 488 489 490

	if (!err && data[IFLA_IPVLAN_FLAGS]) {
		u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]);

		if (flags & IPVLAN_F_PRIVATE)
			ipvlan_mark_private(port);
		else
			ipvlan_clear_private(port);
M
Mahesh Bandewar 已提交
491 492 493 494 495

		if (flags & IPVLAN_F_VEPA)
			ipvlan_mark_vepa(port);
		else
			ipvlan_clear_vepa(port);
496 497
	}

M
Mahesh Bandewar 已提交
498
	return err;
499 500 501 502 503 504
}

/* rtnl ->get_size: room for the two u16 IFLA_IPVLAN_* attributes. */
static size_t ipvlan_nl_getsize(const struct net_device *dev)
{
	return nla_total_size(2)	/* IFLA_IPVLAN_MODE */
	     + nla_total_size(2);	/* IFLA_IPVLAN_FLAGS */
}

509 510
static int ipvlan_nl_validate(struct nlattr *tb[], struct nlattr *data[],
			      struct netlink_ext_ack *extack)
511
{
512 513 514 515
	if (!data)
		return 0;

	if (data[IFLA_IPVLAN_MODE]) {
516 517 518 519 520
		u16 mode = nla_get_u16(data[IFLA_IPVLAN_MODE]);

		if (mode < IPVLAN_MODE_L2 || mode >= IPVLAN_MODE_MAX)
			return -EINVAL;
	}
521 522 523
	if (data[IFLA_IPVLAN_FLAGS]) {
		u16 flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]);

M
Mahesh Bandewar 已提交
524 525 526 527 528 529
		/* Only two bits are used at this moment. */
		if (flags & ~(IPVLAN_F_PRIVATE | IPVLAN_F_VEPA))
			return -EINVAL;
		/* Also both flags can't be active at the same time. */
		if ((flags & (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA)) ==
		    (IPVLAN_F_PRIVATE | IPVLAN_F_VEPA))
530 531 532
			return -EINVAL;
	}

533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548
	return 0;
}

static int ipvlan_nl_fillinfo(struct sk_buff *skb,
			      const struct net_device *dev)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev);
	int ret = -EINVAL;

	if (!port)
		goto err;

	ret = -EMSGSIZE;
	if (nla_put_u16(skb, IFLA_IPVLAN_MODE, port->mode))
		goto err;
549 550
	if (nla_put_u16(skb, IFLA_IPVLAN_FLAGS, port->flags))
		goto err;
551 552 553 554 555 556 557

	return 0;

err:
	return ret;
}

558
/* rtnl ->newlink: create one ipvlan slave on the lower device given by
 * IFLA_LINK.  Another ipvlan may be given as the link, in which case its
 * lower device is used instead.  Shared with ipvtap (exported).
 */
int ipvlan_link_new(struct net *src_net, struct net_device *dev,
		    struct nlattr *tb[], struct nlattr *data[],
		    struct netlink_ext_ack *extack)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct ipvl_port *port;
	struct net_device *phy_dev;
	int err;
	u16 mode = IPVLAN_MODE_L3;

	if (!tb[IFLA_LINK])
		return -EINVAL;

	phy_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
	if (!phy_dev)
		return -ENODEV;

	if (netif_is_ipvlan(phy_dev)) {
		/* Chaining onto another ipvlan: re-target its lower dev. */
		struct ipvl_dev *tmp = netdev_priv(phy_dev);

		phy_dev = tmp->phy_dev;
	} else if (!netif_is_ipvlan_port(phy_dev)) {
		/* Exit early if the underlying link is invalid or busy */
		if (phy_dev->type != ARPHRD_ETHER ||
		    phy_dev->flags & IFF_LOOPBACK) {
			netdev_err(phy_dev,
				   "Master is either lo or non-ether device\n");
			return -EINVAL;
		}

		if (netdev_is_rx_handler_busy(phy_dev)) {
			netdev_err(phy_dev, "Device is already in use.\n");
			return -EBUSY;
		}
	}

	ipvlan->phy_dev = phy_dev;
	ipvlan->dev = dev;
	ipvlan->sfeatures = IPVLAN_FEATURES;
	/* Only default the MTU to the lower device's when none was given. */
	if (!tb[IFLA_MTU])
		ipvlan_adjust_mtu(ipvlan, phy_dev);
	INIT_LIST_HEAD(&ipvlan->addrs);
	spin_lock_init(&ipvlan->addrs_lock);

	/* TODO Probably put random address here to be presented to the
	 * world but keep using the physical-dev address for the outgoing
	 * packets.
	 */
	memcpy(dev->dev_addr, phy_dev->dev_addr, ETH_ALEN);

	dev->priv_flags |= IFF_NO_RX_HANDLER;

	err = register_netdevice(dev);
	if (err < 0)
		return err;

	/* ipvlan_init() would have created the port, if required */
	port = ipvlan_port_get_rtnl(phy_dev);
	ipvlan->port = port;

	/* If the port-id base is at the MAX value, then wrap it around and
	 * begin from 0x1 again. This may be due to a busy system where lots
	 * of slaves are getting created and deleted.
	 */
	if (port->dev_id_start == 0xFFFE)
		port->dev_id_start = 0x1;

	/* Since L2 address is shared among all IPvlan slaves including
	 * master, use unique 16 bit dev-ids to differentiate among them.
	 * Assign IDs between 0x1 and 0xFFFE (used by the master) to each
	 * slave link [see addrconf_ifid_eui48()].
	 */
	err = ida_simple_get(&port->ida, port->dev_id_start, 0xFFFE,
			     GFP_KERNEL);
	if (err < 0)
		err = ida_simple_get(&port->ida, 0x1, port->dev_id_start,
				     GFP_KERNEL);
	if (err < 0)
		goto unregister_netdev;
	dev->dev_id = err;

	/* Increment id-base to the next slot for the future assignment */
	port->dev_id_start = err + 1;

	err = netdev_upper_dev_link(phy_dev, dev, extack);
	if (err)
		goto remove_ida;

	/* Flags are per port and latest update overrides. User has
	 * to be consistent in setting it just like the mode attribute.
	 */
	if (data && data[IFLA_IPVLAN_FLAGS])
		port->flags = nla_get_u16(data[IFLA_IPVLAN_FLAGS]);

	if (data && data[IFLA_IPVLAN_MODE])
		mode = nla_get_u16(data[IFLA_IPVLAN_MODE]);

	err = ipvlan_set_port_mode(port, mode);
	if (err)
		goto unlink_netdev;

	list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans);
	netif_stacked_transfer_operstate(phy_dev, dev);
	return 0;

	/* Error unwinding: undo the steps above in reverse order. */
unlink_netdev:
	netdev_upper_dev_unlink(phy_dev, dev);
remove_ida:
	ida_simple_remove(&port->ida, dev->dev_id);
unregister_netdev:
	unregister_netdevice(dev);
	return err;
}
EXPORT_SYMBOL_GPL(ipvlan_link_new);
672

673
/* rtnl ->dellink: drop all of the slave's addresses, release its dev_id
 * and detach it from the port.  Shared with ipvtap (exported).
 */
void ipvlan_link_delete(struct net_device *dev, struct list_head *head)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct ipvl_addr *addr, *next;

	spin_lock_bh(&ipvlan->addrs_lock);
	list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) {
		ipvlan_ht_addr_del(addr);
		list_del_rcu(&addr->anode);
		kfree_rcu(addr, rcu);
	}
	spin_unlock_bh(&ipvlan->addrs_lock);

	ida_simple_remove(&ipvlan->port->ida, dev->dev_id);
	list_del_rcu(&ipvlan->pnode);
	unregister_netdevice_queue(dev, head);
	netdev_upper_dev_unlink(ipvlan->phy_dev, dev);
}
EXPORT_SYMBOL_GPL(ipvlan_link_delete);
692

693
/* rtnl ->setup: ethernet defaults tweaked for a queueless virtual slave.
 * Shared with ipvtap (exported).
 */
void ipvlan_link_setup(struct net_device *dev)
{
	ether_setup(dev);

	dev->max_mtu = ETH_MAX_MTU;
	dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING);
	dev->priv_flags |= IFF_UNICAST_FLT | IFF_NO_QUEUE;
	dev->netdev_ops = &ipvlan_netdev_ops;
	dev->needs_free_netdev = true;
	dev->header_ops = &ipvlan_header_ops;
	dev->ethtool_ops = &ipvlan_ethtool_ops;
}
EXPORT_SYMBOL_GPL(ipvlan_link_setup);
706 707 708 709

/* Netlink attribute policy for the IFLA_IPVLAN_* attributes. */
static const struct nla_policy ipvlan_nl_policy[IFLA_IPVLAN_MAX + 1] =
{
	[IFLA_IPVLAN_MODE] = { .type = NLA_U16 },
	[IFLA_IPVLAN_FLAGS] = { .type = NLA_U16 },
};

/* rtnl ops for the "ipvlan" kind; the remaining common callbacks are
 * filled in by ipvlan_link_register().
 */
static struct rtnl_link_ops ipvlan_link_ops = {
	.kind		= "ipvlan",
	.priv_size	= sizeof(struct ipvl_dev),

	.setup		= ipvlan_link_setup,
	.newlink	= ipvlan_link_new,
	.dellink	= ipvlan_link_delete,
};

722
int ipvlan_link_register(struct rtnl_link_ops *ops)
723
{
724 725 726 727 728 729
	ops->get_size	= ipvlan_nl_getsize;
	ops->policy	= ipvlan_nl_policy;
	ops->validate	= ipvlan_nl_validate;
	ops->fill_info	= ipvlan_nl_fillinfo;
	ops->changelink = ipvlan_nl_changelink;
	ops->maxtype	= IFLA_IPVLAN_MAX;
730 731
	return rtnl_link_register(ops);
}
732
EXPORT_SYMBOL_GPL(ipvlan_link_register);
733 734 735 736 737 738 739 740 741

/* Netdev notifier for the lower (port) device: propagate operstate, MTU,
 * feature and MAC changes to all slaves, migrate netfilter hooks on a
 * netns move, and destroy all slaves when the lower device goes away.
 */
static int ipvlan_device_event(struct notifier_block *unused,
			       unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct ipvl_dev *ipvlan, *next;
	struct ipvl_port *port;
	LIST_HEAD(lst_kill);

	if (!netif_is_ipvlan_port(dev))
		return NOTIFY_DONE;

	port = ipvlan_port_get_rtnl(dev);

	switch (event) {
	case NETDEV_CHANGE:
		list_for_each_entry(ipvlan, &port->ipvlans, pnode)
			netif_stacked_transfer_operstate(ipvlan->phy_dev,
							 ipvlan->dev);
		break;

	case NETDEV_REGISTER: {
		struct net *oldnet, *newnet = dev_net(dev);
		struct ipvlan_netns *old_vnet;

		oldnet = read_pnet(&port->pnet);
		if (net_eq(newnet, oldnet))
			break;

		/* Port moved to another netns: update the cached pnet and
		 * migrate any L3S netfilter hook reference with it.
		 */
		write_pnet(&port->pnet, newnet);

		old_vnet = net_generic(oldnet, ipvlan_netid);
		if (!old_vnet->ipvl_nf_hook_refcnt)
			break;

		ipvlan_register_nf_hook(newnet);
		ipvlan_unregister_nf_hook(oldnet);
		break;
	}
	case NETDEV_UNREGISTER:
		if (dev->reg_state != NETREG_UNREGISTERING)
			break;

		/* Lower device is going away: delete every slave with it. */
		list_for_each_entry_safe(ipvlan, next, &port->ipvlans, pnode)
			ipvlan->dev->rtnl_link_ops->dellink(ipvlan->dev,
							    &lst_kill);
		unregister_netdevice_many(&lst_kill);
		break;

	case NETDEV_FEAT_CHANGE:
		list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
			ipvlan->dev->features = dev->features & IPVLAN_FEATURES;
			ipvlan->dev->gso_max_size = dev->gso_max_size;
			ipvlan->dev->gso_max_segs = dev->gso_max_segs;
			netdev_features_change(ipvlan->dev);
		}
		break;

	case NETDEV_CHANGEMTU:
		list_for_each_entry(ipvlan, &port->ipvlans, pnode)
			ipvlan_adjust_mtu(ipvlan, dev);
		break;

	case NETDEV_CHANGEADDR:
		/* All slaves share the lower device's MAC address. */
		list_for_each_entry(ipvlan, &port->ipvlans, pnode) {
			ether_addr_copy(ipvlan->dev->dev_addr, dev->dev_addr);
			call_netdevice_notifiers(NETDEV_CHANGEADDR, ipvlan->dev);
		}
		break;

	case NETDEV_PRE_TYPE_CHANGE:
		/* Forbid underlying device to change its type. */
		return NOTIFY_BAD;
	}
	return NOTIFY_DONE;
}

810
/* the caller must hold ipvlan->addrs_lock */
static int ipvlan_add_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6)
{
	struct ipvl_addr *addr;

	/* GFP_ATOMIC: we are under the (BH) addrs_lock here. */
	addr = kzalloc(sizeof(struct ipvl_addr), GFP_ATOMIC);
	if (!addr)
		return -ENOMEM;

	addr->master = ipvlan;
	if (!is_v6) {
		memcpy(&addr->ip4addr, iaddr, sizeof(struct in_addr));
		addr->atype = IPVL_IPV4;
#if IS_ENABLED(CONFIG_IPV6)
	} else {
		memcpy(&addr->ip6addr, iaddr, sizeof(struct in6_addr));
		addr->atype = IPVL_IPV6;
#endif
	}

	list_add_tail_rcu(&addr->anode, &ipvlan->addrs);

	/* If the interface is not up, the address will be added to the hash
	 * list by ipvlan_open.
	 */
	if (netif_running(ipvlan->dev))
		ipvlan_ht_addr_add(ipvlan, addr);

	return 0;
}

841
/* Remove @iaddr from the slave, if present: unhash it, unlink it from
 * the address list and free it after an RCU grace period.
 */
static void ipvlan_del_addr(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6)
{
	struct ipvl_addr *addr;

	spin_lock_bh(&ipvlan->addrs_lock);
	addr = ipvlan_find_addr(ipvlan, iaddr, is_v6);
	if (addr) {
		ipvlan_ht_addr_del(addr);
		list_del_rcu(&addr->anode);
	}
	spin_unlock_bh(&ipvlan->addrs_lock);

	if (addr)
		kfree_rcu(addr, rcu);
}

M
Matteo Croce 已提交
858 859 860 861 862 863 864 865 866 867 868 869 870 871
/* True when @dev is an ipvlan/ipvtap slave that is fully set up (its
 * private area and port pointer are in place).
 */
static bool ipvlan_is_valid_dev(const struct net_device *dev)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);

	return netif_is_ipvlan(dev) && ipvlan && ipvlan->port;
}

#if IS_ENABLED(CONFIG_IPV6)
/* Record a new IPv6 address on the slave unless an ipvlan on the same
 * port already owns it.
 */
static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr)
{
	int ret = -EINVAL;

	spin_lock_bh(&ipvlan->addrs_lock);
	if (ipvlan_addr_busy(ipvlan->port, ip6_addr, true))
		netif_err(ipvlan, ifup, ipvlan->dev,
			  "Failed to add IPv6=%pI6c addr for %s intf\n",
			  ip6_addr, ipvlan->dev->name);
	else
		ret = ipvlan_add_addr(ipvlan, ip6_addr, true);
	spin_unlock_bh(&ipvlan->addrs_lock);
	return ret;
}

static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr)
{
	return ipvlan_del_addr(ipvlan, ip6_addr, true);
}

/* inet6addr notifier: mirror IPv6 address add/remove onto the slave. */
static int ipvlan_addr6_event(struct notifier_block *unused,
			      unsigned long event, void *ptr)
{
	struct inet6_ifaddr *if6 = (struct inet6_ifaddr *)ptr;
	struct net_device *dev = (struct net_device *)if6->idev->dev;
	struct ipvl_dev *ipvlan = netdev_priv(dev);

	if (!ipvlan_is_valid_dev(dev))
		return NOTIFY_DONE;

	switch (event) {
	case NETDEV_UP:
		if (ipvlan_add_addr6(ipvlan, &if6->addr))
			return NOTIFY_BAD;
		break;

	case NETDEV_DOWN:
		ipvlan_del_addr6(ipvlan, &if6->addr);
		break;
	}

	return NOTIFY_OK;
}

/* Validator notifier: veto IPv6 addresses that are already claimed on
 * this port before the address is actually installed.
 */
static int ipvlan_addr6_validator_event(struct notifier_block *unused,
					unsigned long event, void *ptr)
{
	struct in6_validator_info *i6vi = (struct in6_validator_info *)ptr;
	struct net_device *dev = (struct net_device *)i6vi->i6vi_dev->dev;
	struct ipvl_dev *ipvlan = netdev_priv(dev);

	if (!ipvlan_is_valid_dev(dev))
		return NOTIFY_DONE;

	switch (event) {
	case NETDEV_UP:
		if (ipvlan_addr_busy(ipvlan->port, &i6vi->i6vi_addr, true)) {
			NL_SET_ERR_MSG(i6vi->extack,
				       "Address already assigned to an ipvlan device");
			return notifier_from_errno(-EADDRINUSE);
		}
		break;
	}

	return NOTIFY_OK;
}
#endif
939

940 941
static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr)
{
942 943 944 945
	int ret = -EINVAL;

	spin_lock_bh(&ipvlan->addrs_lock);
	if (ipvlan_addr_busy(ipvlan->port, ip4_addr, false))
946 947 948
		netif_err(ipvlan, ifup, ipvlan->dev,
			  "Failed to add IPv4=%pI4 on %s intf.\n",
			  ip4_addr, ipvlan->dev->name);
949 950 951 952
	else
		ret = ipvlan_add_addr(ipvlan, ip4_addr, false);
	spin_unlock_bh(&ipvlan->addrs_lock);
	return ret;
953 954 955 956
}

static void ipvlan_del_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr)
{
957
	return ipvlan_del_addr(ipvlan, ip4_addr, false);
958 959 960 961 962 963 964 965 966 967
}

/* inetaddr notifier: mirror IPv4 address add/remove onto the slave. */
static int ipvlan_addr4_event(struct notifier_block *unused,
			      unsigned long event, void *ptr)
{
	struct in_ifaddr *if4 = (struct in_ifaddr *)ptr;
	struct net_device *dev = (struct net_device *)if4->ifa_dev->dev;
	struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct in_addr ip4_addr;

	if (!ipvlan_is_valid_dev(dev))
		return NOTIFY_DONE;

	switch (event) {
	case NETDEV_UP:
		ip4_addr.s_addr = if4->ifa_address;
		if (ipvlan_add_addr4(ipvlan, &ip4_addr))
			return NOTIFY_BAD;
		break;

	case NETDEV_DOWN:
		ip4_addr.s_addr = if4->ifa_address;
		ipvlan_del_addr4(ipvlan, &ip4_addr);
		break;
	}

	return NOTIFY_OK;
}

987 988 989 990 991 992 993
/* Validator notifier: veto IPv4 addresses that are already claimed on
 * this port before the address is actually installed.
 */
static int ipvlan_addr4_validator_event(struct notifier_block *unused,
					unsigned long event, void *ptr)
{
	struct in_validator_info *ivi = (struct in_validator_info *)ptr;
	struct net_device *dev = (struct net_device *)ivi->ivi_dev->dev;
	struct ipvl_dev *ipvlan = netdev_priv(dev);

	if (!ipvlan_is_valid_dev(dev))
		return NOTIFY_DONE;

	switch (event) {
	case NETDEV_UP:
		if (ipvlan_addr_busy(ipvlan->port, &ivi->ivi_addr, false)) {
			NL_SET_ERR_MSG(ivi->extack,
				       "Address already assigned to an ipvlan device");
			return notifier_from_errno(-EADDRINUSE);
		}
		break;
	}

	return NOTIFY_OK;
}

1010 1011 1012 1013
/* IPv4 address add/remove notifications. */
static struct notifier_block ipvlan_addr4_notifier_block __read_mostly = {
	.notifier_call = ipvlan_addr4_event,
};

/* Rejects IPv4 addresses already claimed by another ipvlan slave. */
static struct notifier_block ipvlan_addr4_vtor_notifier_block __read_mostly = {
	.notifier_call = ipvlan_addr4_validator_event,
};

/* Events on the lower (port) device. */
static struct notifier_block ipvlan_notifier_block __read_mostly = {
	.notifier_call = ipvlan_device_event,
};

#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 counterparts of the two address notifiers above. */
static struct notifier_block ipvlan_addr6_notifier_block __read_mostly = {
	.notifier_call = ipvlan_addr6_event,
};

static struct notifier_block ipvlan_addr6_vtor_notifier_block __read_mostly = {
	.notifier_call = ipvlan_addr6_validator_event,
};
#endif
1031

1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048
/* Pernet exit: all ports should be gone by now, so a non-zero hook
 * refcount indicates a leak — warn once and force-remove the hooks.
 */
static void ipvlan_ns_exit(struct net *net)
{
	struct ipvlan_netns *vnet = net_generic(net, ipvlan_netid);

	if (WARN_ON_ONCE(vnet->ipvl_nf_hook_refcnt)) {
		vnet->ipvl_nf_hook_refcnt = 0;
		nf_unregister_net_hooks(net, ipvl_nfops,
					ARRAY_SIZE(ipvl_nfops));
	}
}

static struct pernet_operations ipvlan_net_ops = {
	.id = &ipvlan_netid,
	.size = sizeof(struct ipvlan_netns),
	.exit = ipvlan_ns_exit,
};

1049 1050 1051 1052 1053 1054
/* Module init: notifiers first, then the pernet subsys, finally the
 * rtnl link ops that make the "ipvlan" kind creatable.
 */
static int __init ipvlan_init_module(void)
{
	int err;

	ipvlan_init_secret();
	register_netdevice_notifier(&ipvlan_notifier_block);
#if IS_ENABLED(CONFIG_IPV6)
	register_inet6addr_notifier(&ipvlan_addr6_notifier_block);
	register_inet6addr_validator_notifier(
	    &ipvlan_addr6_vtor_notifier_block);
#endif
	register_inetaddr_notifier(&ipvlan_addr4_notifier_block);
	register_inetaddr_validator_notifier(&ipvlan_addr4_vtor_notifier_block);

	err = register_pernet_subsys(&ipvlan_net_ops);
	if (err < 0)
		goto error;

	err = ipvlan_link_register(&ipvlan_link_ops);
	if (err < 0) {
		unregister_pernet_subsys(&ipvlan_net_ops);
		goto error;
	}

	return 0;
error:
	/* Roll back every notifier registered above. */
	unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block);
	unregister_inetaddr_validator_notifier(
	    &ipvlan_addr4_vtor_notifier_block);
#if IS_ENABLED(CONFIG_IPV6)
	unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block);
	unregister_inet6addr_validator_notifier(
	    &ipvlan_addr6_vtor_notifier_block);
#endif
	unregister_netdevice_notifier(&ipvlan_notifier_block);
	return err;
}

/* Module exit: tear down in the reverse order of ipvlan_init_module(). */
static void __exit ipvlan_cleanup_module(void)
{
	rtnl_link_unregister(&ipvlan_link_ops);
	unregister_pernet_subsys(&ipvlan_net_ops);
	unregister_netdevice_notifier(&ipvlan_notifier_block);
	unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block);
	unregister_inetaddr_validator_notifier(
	    &ipvlan_addr4_vtor_notifier_block);
#if IS_ENABLED(CONFIG_IPV6)
	unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block);
	unregister_inet6addr_validator_notifier(
	    &ipvlan_addr6_vtor_notifier_block);
#endif
}

module_init(ipvlan_init_module);
module_exit(ipvlan_cleanup_module);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Mahesh Bandewar <maheshb@google.com>");
MODULE_DESCRIPTION("Driver for L3 (IPv6/IPv4) based VLANs");
MODULE_ALIAS_RTNL_LINK("ipvlan");