ipvlan_core.c 19.2 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11
/* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of
 * the License, or (at your option) any later version.
 *
 */

#include "ipvlan.h"

12
/* Seed passed to every jhash computation in this file (IPv4/IPv6 address
 * hashes and the MAC filter hash). Filled in by ipvlan_init_secret().
 */
static u32 ipvlan_jhash_secret __read_mostly;
13 14 15 16 17 18

/* Initialise ipvlan_jhash_secret via net_get_random_once().
 * NOTE(review): the helper is presumed to fill the buffer with random
 * bytes on the first call only; its definition is not visible here.
 */
void ipvlan_init_secret(void)
{
	net_get_random_once(&ipvlan_jhash_secret, sizeof(ipvlan_jhash_secret));
}

19
/* Account one received frame against the per-CPU stats of @ipvlan.
 *
 * @ipvlan:  device whose counters are updated
 * @len:     frame length to add to rx_bytes
 * @success: false bumps rx_errs only; true bumps rx_pkts/rx_bytes
 * @mcast:   additionally bump rx_mcast (only looked at on success)
 */
void ipvlan_count_rx(const struct ipvl_dev *ipvlan,
			    unsigned int len, bool success, bool mcast)
{
	struct ipvl_pcpu_stats *stats;

	if (unlikely(!success)) {
		this_cpu_inc(ipvlan->pcpu_stats->rx_errs);
		return;
	}

	stats = this_cpu_ptr(ipvlan->pcpu_stats);

	/* Packet/byte counters are updated inside a u64_stats section. */
	u64_stats_update_begin(&stats->syncp);
	stats->rx_pkts++;
	stats->rx_bytes += len;
	if (mcast)
		stats->rx_mcast++;
	u64_stats_update_end(&stats->syncp);
}
EXPORT_SYMBOL_GPL(ipvlan_count_rx);
37

M
Matteo Croce 已提交
38
#if IS_ENABLED(CONFIG_IPV6)
/* Hash bucket for the IPv6 address at @iaddr (a struct in6_addr):
 * seeded jhash of the address, reduced with IPVLAN_HASH_MASK.
 */
static u8 ipvlan_get_v6_hash(const void *iaddr)
{
	const struct in6_addr *v6 = iaddr;
	u32 h = __ipv6_addr_jhash(v6, ipvlan_jhash_secret);

	return h & IPVLAN_HASH_MASK;
}
#else
/* Without IPv6 support every IPv6 lookup maps to bucket 0. */
static u8 ipvlan_get_v6_hash(const void *iaddr)
{
	return 0;
}
#endif
52 53 54 55 56 57 58 59 60

static u8 ipvlan_get_v4_hash(const void *iaddr)
{
	const struct in_addr *ip4_addr = iaddr;

	return jhash_1word(ip4_addr->s_addr, ipvlan_jhash_secret) &
	       IPVLAN_HASH_MASK;
}

M
Matteo Croce 已提交
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
/* Compare the address stored in @addr with the raw address at @iaddr.
 *
 * @is_v6 tells how to interpret @iaddr (struct in6_addr vs struct in_addr).
 * The entry only matches when its own address type agrees with @is_v6;
 * an IPv6 request never matches when IPv6 support is compiled out.
 */
static bool addr_equal(bool is_v6, struct ipvl_addr *addr, const void *iaddr)
{
#if IS_ENABLED(CONFIG_IPV6)
	if (is_v6) {
		const struct in6_addr *want6 = iaddr;

		if (addr->atype != IPVL_IPV6)
			return false;
		return ipv6_addr_equal(&addr->ip6addr, want6);
	}
#endif
	if (is_v6 || addr->atype != IPVL_IPV4)
		return false;

	return addr->ip4addr.s_addr == ((const struct in_addr *)iaddr)->s_addr;
}

M
Mahesh Bandewar 已提交
78 79
/* Look up @iaddr in the port-wide address hash table.
 *
 * Walks the bucket with the RCU list iterator and returns the first entry
 * for which addr_equal() holds, or NULL when none matches.
 */
static struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port,
					       const void *iaddr, bool is_v6)
{
	struct ipvl_addr *entry;
	u8 bucket;

	if (is_v6)
		bucket = ipvlan_get_v6_hash(iaddr);
	else
		bucket = ipvlan_get_v4_hash(iaddr);

	hlist_for_each_entry_rcu(entry, &port->hlhead[bucket], hlnode) {
		if (addr_equal(is_v6, entry, iaddr))
			return entry;
	}

	return NULL;
}

void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr)
{
	struct ipvl_port *port = ipvlan->port;
	u8 hash;

	hash = (addr->atype == IPVL_IPV6) ?
	       ipvlan_get_v6_hash(&addr->ip6addr) :
	       ipvlan_get_v4_hash(&addr->ip4addr);
100 101
	if (hlist_unhashed(&addr->hlnode))
		hlist_add_head_rcu(&addr->hlnode, &port->hlhead[hash]);
102 103
}

104
void ipvlan_ht_addr_del(struct ipvl_addr *addr)
105
{
106
	hlist_del_init_rcu(&addr->hlnode);
107 108
}

109 110
/* Search the address list of a single ipvlan device for @iaddr.
 *
 * The walk happens inside an RCU read-side section that is left before
 * returning. Returns the matching entry or NULL.
 */
struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan,
				   const void *iaddr, bool is_v6)
{
	struct ipvl_addr *found = NULL;
	struct ipvl_addr *cur;

	rcu_read_lock();
	list_for_each_entry_rcu(cur, &ipvlan->addrs, anode) {
		if (!addr_equal(is_v6, cur, iaddr))
			continue;
		found = cur;
		break;
	}
	rcu_read_unlock();

	return found;
}
124

125 126 127
/* Report whether any ipvlan device on @port already owns @iaddr. */
bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6)
{
	struct ipvl_dev *slave;
	bool busy = false;

	rcu_read_lock();
	list_for_each_entry_rcu(slave, &port->ipvlans, pnode) {
		if (!ipvlan_find_addr(slave, iaddr, is_v6))
			continue;
		busy = true;
		break;
	}
	rcu_read_unlock();

	return busy;
}

141
static void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type)
142 143 144 145 146 147 148
{
	void *lyr3h = NULL;

	switch (skb->protocol) {
	case htons(ETH_P_ARP): {
		struct arphdr *arph;

149
		if (unlikely(!pskb_may_pull(skb, arp_hdr_len(port->dev))))
150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174
			return NULL;

		arph = arp_hdr(skb);
		*type = IPVL_ARP;
		lyr3h = arph;
		break;
	}
	case htons(ETH_P_IP): {
		u32 pktlen;
		struct iphdr *ip4h;

		if (unlikely(!pskb_may_pull(skb, sizeof(*ip4h))))
			return NULL;

		ip4h = ip_hdr(skb);
		pktlen = ntohs(ip4h->tot_len);
		if (ip4h->ihl < 5 || ip4h->version != 4)
			return NULL;
		if (skb->len < pktlen || pktlen < (ip4h->ihl * 4))
			return NULL;

		*type = IPVL_IPV4;
		lyr3h = ip4h;
		break;
	}
M
Matteo Croce 已提交
175
#if IS_ENABLED(CONFIG_IPV6)
176 177 178 179 180 181 182 183 184 185 186 187 188 189 190
	case htons(ETH_P_IPV6): {
		struct ipv6hdr *ip6h;

		if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h))))
			return NULL;

		ip6h = ipv6_hdr(skb);
		if (ip6h->version != 6)
			return NULL;

		*type = IPVL_IPV6;
		lyr3h = ip6h;
		/* Only Neighbour Solicitation pkts need different treatment */
		if (ipv6_addr_any(&ip6h->saddr) &&
		    ip6h->nexthdr == NEXTHDR_ICMP) {
191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208
			struct icmp6hdr	*icmph;

			if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h) + sizeof(*icmph))))
				return NULL;

			ip6h = ipv6_hdr(skb);
			icmph = (struct icmp6hdr *)(ip6h + 1);

			if (icmph->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) {
				/* Need to access the ipv6 address in body */
				if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h) + sizeof(*icmph)
						+ sizeof(struct in6_addr))))
					return NULL;

				ip6h = ipv6_hdr(skb);
				icmph = (struct icmp6hdr *)(ip6h + 1);
			}

209
			*type = IPVL_ICMPV6;
210
			lyr3h = icmph;
211 212 213
		}
		break;
	}
M
Matteo Croce 已提交
214
#endif
215 216 217 218 219 220 221 222 223 224 225 226 227 228 229
	default:
		return NULL;
	}

	return lyr3h;
}

/* Hash a MAC address into a MAC-filter bit index: seeded jhash of the
 * four bytes starting at @addr + 2, reduced with IPVLAN_MAC_FILTER_MASK.
 */
unsigned int ipvlan_mac_hash(const unsigned char *addr)
{
	u32 low32 = __get_unaligned_cpu32(addr + 2);

	return jhash_1word(low32, ipvlan_jhash_secret) & IPVLAN_MAC_FILTER_MASK;
}

230
void ipvlan_process_multicast(struct work_struct *work)
231
{
232 233
	struct ipvl_port *port = container_of(work, struct ipvl_port, wq);
	struct ethhdr *ethh;
234
	struct ipvl_dev *ipvlan;
235 236
	struct sk_buff *skb, *nskb;
	struct sk_buff_head list;
237 238 239
	unsigned int len;
	unsigned int mac_hash;
	int ret;
240
	u8 pkt_type;
M
Mahesh Bandewar 已提交
241
	bool tx_pkt;
242

243
	__skb_queue_head_init(&list);
244

245 246 247
	spin_lock_bh(&port->backlog.lock);
	skb_queue_splice_tail_init(&port->backlog, &list);
	spin_unlock_bh(&port->backlog.lock);
248

249
	while ((skb = __skb_dequeue(&list)) != NULL) {
250 251 252
		struct net_device *dev = skb->dev;
		bool consumed = false;

253
		ethh = eth_hdr(skb);
M
Mahesh Bandewar 已提交
254
		tx_pkt = IPVL_SKB_CB(skb)->tx_pkt;
255
		mac_hash = ipvlan_mac_hash(ethh->h_dest);
256

257 258
		if (ether_addr_equal(ethh->h_dest, port->dev->broadcast))
			pkt_type = PACKET_BROADCAST;
259
		else
260 261 262 263
			pkt_type = PACKET_MULTICAST;

		rcu_read_lock();
		list_for_each_entry_rcu(ipvlan, &port->ipvlans, pnode) {
M
Mahesh Bandewar 已提交
264
			if (tx_pkt && (ipvlan->dev == skb->dev))
265 266 267
				continue;
			if (!test_bit(mac_hash, ipvlan->mac_filters))
				continue;
268 269
			if (!(ipvlan->dev->flags & IFF_UP))
				continue;
270 271 272
			ret = NET_RX_DROP;
			len = skb->len + ETH_HLEN;
			nskb = skb_clone(skb, GFP_ATOMIC);
273 274 275 276 277
			local_bh_disable();
			if (nskb) {
				consumed = true;
				nskb->pkt_type = pkt_type;
				nskb->dev = ipvlan->dev;
M
Mahesh Bandewar 已提交
278
				if (tx_pkt)
279 280 281 282
					ret = dev_forward_skb(ipvlan->dev, nskb);
				else
					ret = netif_rx(nskb);
			}
283
			ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, true);
284
			local_bh_enable();
285 286 287
		}
		rcu_read_unlock();

M
Mahesh Bandewar 已提交
288
		if (tx_pkt) {
289 290 291 292 293
			/* If the packet originated here, send it out. */
			skb->dev = port->dev;
			skb->pkt_type = pkt_type;
			dev_queue_xmit(skb);
		} else {
294 295 296 297
			if (consumed)
				consume_skb(skb);
			else
				kfree_skb(skb);
298
		}
299 300
		if (dev)
			dev_put(dev);
301 302 303
	}
}

304 305 306 307 308 309 310 311 312 313 314 315
/* Scrub @skb before it is handed to another device.
 *
 * With a target @dev the scrub is a cross-namespace scrub only when the
 * skb's current device and @dev live in different namespaces, and the skb
 * is re-pointed at @dev afterwards. With @dev == NULL the skb is always
 * scrubbed as if crossing namespaces and skb->dev is left alone.
 */
static void ipvlan_skb_crossing_ns(struct sk_buff *skb, struct net_device *dev)
{
	if (!dev) {
		skb_scrub_packet(skb, true);
		return;
	}

	skb_scrub_packet(skb, !net_eq(dev_net(skb->dev), dev_net(dev)));
	skb->dev = dev;
}

316
static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff **pskb,
317 318 319 320 321 322 323
			    bool local)
{
	struct ipvl_dev *ipvlan = addr->master;
	struct net_device *dev = ipvlan->dev;
	unsigned int len;
	rx_handler_result_t ret = RX_HANDLER_CONSUMED;
	bool success = false;
324
	struct sk_buff *skb = *pskb;
325 326

	len = skb->len + ETH_HLEN;
M
Mahesh Bandewar 已提交
327 328 329 330 331 332 333 334
	/* Only packets exchanged between two local slaves need to have
	 * device-up check as well as skb-share check.
	 */
	if (local) {
		if (unlikely(!(dev->flags & IFF_UP))) {
			kfree_skb(skb);
			goto out;
		}
335

M
Mahesh Bandewar 已提交
336 337 338
		skb = skb_share_check(skb, GFP_ATOMIC);
		if (!skb)
			goto out;
339

M
Mahesh Bandewar 已提交
340 341
		*pskb = skb;
	}
342 343

	if (local) {
M
Mahesh Bandewar 已提交
344
		skb->pkt_type = PACKET_HOST;
345 346 347
		if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS)
			success = true;
	} else {
348
		skb->dev = dev;
349 350 351 352 353 354 355 356 357 358 359 360 361 362 363
		ret = RX_HANDLER_ANOTHER;
		success = true;
	}

out:
	ipvlan_count_rx(ipvlan, len, success, false);
	return ret;
}

/* Extract an address from the layer-3 header @lyr3h and look it up in the
 * port hash table.
 *
 * @addr_type: header kind as reported by ipvlan_get_L3_hdr()
 * @use_dest:  pick the destination/target address instead of the source
 *             (ignored for IPVL_ICMPV6, which always uses the ND target)
 *
 * Returns the matching entry, or NULL when nothing matches, the header
 * kind is unknown, or an ICMPv6 header is not a Neighbour Solicitation.
 */
static struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port,
					    void *lyr3h, int addr_type,
					    bool use_dest)
{
	switch (addr_type) {
#if IS_ENABLED(CONFIG_IPV6)
	case IPVL_IPV6: {
		struct ipv6hdr *ip6h = lyr3h;
		struct in6_addr *key = use_dest ? &ip6h->daddr : &ip6h->saddr;

		return ipvlan_ht_addr_lookup(port, key, true);
	}
	case IPVL_ICMPV6: {
		struct nd_msg *ndmh = lyr3h;

		/* Make sure that the NeighborSolicitation ICMPv6 packets
		 * are handled to avoid DAD issue.
		 */
		if (ndmh->icmph.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION)
			return NULL;

		return ipvlan_ht_addr_lookup(port, &ndmh->target, true);
	}
#endif
	case IPVL_IPV4: {
		struct iphdr *ip4h = lyr3h;
		__be32 *key = use_dest ? &ip4h->daddr : &ip4h->saddr;

		return ipvlan_ht_addr_lookup(port, key, false);
	}
	case IPVL_ARP: {
		struct arphdr *arph = lyr3h;
		unsigned char *arp_ptr = (unsigned char *)(arph + 1);
		__be32 dip;

		/* Step over one hardware address to reach the first IPv4
		 * address in the ARP body, or over two hardware addresses
		 * plus one IPv4 address to reach the second one.
		 */
		if (use_dest)
			arp_ptr += (2 * port->dev->addr_len) + 4;
		else
			arp_ptr += port->dev->addr_len;

		/* Copy out: the address inside the ARP body may be unaligned. */
		memcpy(&dip, arp_ptr, 4);
		return ipvlan_ht_addr_lookup(port, &dip, false);
	}
	}

	return NULL;
}

420
static int ipvlan_process_v4_outbound(struct sk_buff *skb)
421 422 423
{
	const struct iphdr *ip4h = ip_hdr(skb);
	struct net_device *dev = skb->dev;
424
	struct net *net = dev_net(dev);
425 426 427
	struct rtable *rt;
	int err, ret = NET_XMIT_DROP;
	struct flowi4 fl4 = {
428
		.flowi4_oif = dev->ifindex,
429 430
		.flowi4_tos = RT_TOS(ip4h->tos),
		.flowi4_flags = FLOWI_FLAG_ANYSRC,
431
		.flowi4_mark = skb->mark,
432 433 434 435
		.daddr = ip4h->daddr,
		.saddr = ip4h->saddr,
	};

436
	rt = ip_route_output_flow(net, &fl4, NULL);
437 438 439 440 441 442 443 444
	if (IS_ERR(rt))
		goto err;

	if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) {
		ip_rt_put(rt);
		goto err;
	}
	skb_dst_set(skb, &rt->dst);
445
	err = ip_local_out(net, skb->sk, skb);
446 447 448 449 450 451 452 453 454 455 456 457
	if (unlikely(net_xmit_eval(err)))
		dev->stats.tx_errors++;
	else
		ret = NET_XMIT_SUCCESS;
	goto out;
err:
	dev->stats.tx_errors++;
	kfree_skb(skb);
out:
	return ret;
}

M
Matteo Croce 已提交
458
#if IS_ENABLED(CONFIG_IPV6)
/* Route an IPv6 packet in the namespace of skb->dev and pass it to
 * ip6_local_out().
 *
 * On a routing failure the skb is freed here and the (negative)
 * dst->error is returned; otherwise the skb is handed to ip6_local_out()
 * and NET_XMIT_SUCCESS or NET_XMIT_DROP is returned depending on its
 * result. Every failure increments dev->stats.tx_errors.
 */
static int ipvlan_process_v6_outbound(struct sk_buff *skb)
{
	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
	struct net_device *dev = skb->dev;
	struct net *net = dev_net(dev);
	struct dst_entry *dst;
	int err, ret = NET_XMIT_DROP;
	struct flowi6 fl6 = {
		.flowi6_oif = dev->ifindex,
		.daddr = ip6h->daddr,
		.saddr = ip6h->saddr,
		.flowi6_flags = FLOWI_FLAG_ANYSRC,
		.flowlabel = ip6_flowinfo(ip6h),
		.flowi6_mark = skb->mark,
		.flowi6_proto = ip6h->nexthdr,
	};

	dst = ip6_route_output(net, NULL, &fl6);
	if (dst->error) {
		ret = dst->error;
		dst_release(dst);
		goto err;
	}
	skb_dst_set(skb, dst);

	/* skb->cb may still carry data written by whoever handled this skb
	 * before it reached us. The IPv6 output path interprets it as
	 * struct inet6_skb_parm, so start from a clean control buffer
	 * instead of acting on stale bytes.
	 */
	memset(IP6CB(skb), 0, sizeof(*IP6CB(skb)));

	err = ip6_local_out(net, skb->sk, skb);
	if (unlikely(net_xmit_eval(err)))
		dev->stats.tx_errors++;
	else
		ret = NET_XMIT_SUCCESS;
	goto out;
err:
	dev->stats.tx_errors++;
	kfree_skb(skb);
out:
	return ret;
}
#else
/* IPv6 support compiled out: report a drop.
 * NOTE(review): the skb is not freed here; the only caller in this file
 * reaches this stub for ETH_P_IPV6 frames - confirm ownership.
 */
static int ipvlan_process_v6_outbound(struct sk_buff *skb)
{
	return NET_XMIT_DROP;
}
#endif
501

502
static int ipvlan_process_outbound(struct sk_buff *skb)
503 504 505 506 507 508
{
	struct ethhdr *ethh = eth_hdr(skb);
	int ret = NET_XMIT_DROP;

	/* In this mode we dont care about multicast and broadcast traffic */
	if (is_multicast_ether_addr(ethh->h_dest)) {
509 510
		pr_debug_ratelimited("Dropped {multi|broad}cast of type=[%x]\n",
				     ntohs(skb->protocol));
511 512 513 514 515 516 517 518 519 520 521 522 523 524 525
		kfree_skb(skb);
		goto out;
	}

	/* The ipvlan is a pseudo-L2 device, so the packets that we receive
	 * will have L2; which need to discarded and processed further
	 * in the net-ns of the main-device.
	 */
	if (skb_mac_header_was_set(skb)) {
		skb_pull(skb, sizeof(*ethh));
		skb->mac_header = (typeof(skb->mac_header))~0U;
		skb_reset_network_header(skb);
	}

	if (skb->protocol == htons(ETH_P_IPV6))
526
		ret = ipvlan_process_v6_outbound(skb);
527
	else if (skb->protocol == htons(ETH_P_IP))
528
		ret = ipvlan_process_v4_outbound(skb);
529 530 531 532 533 534 535 536 537
	else {
		pr_warn_ratelimited("Dropped outbound packet type=%x\n",
				    ntohs(skb->protocol));
		kfree_skb(skb);
	}
out:
	return ret;
}

538
/* Queue a multicast/broadcast frame on the port backlog for deferred
 * distribution by the port's work item.
 *
 * @tx_pkt records whether the frame comes from the TX (true) or RX (false)
 * path. ETH_P_PAUSE frames are dropped outright. When the backlog already
 * holds IPVLAN_QBACKLOG_LIMIT frames the skb is dropped and accounted in
 * the rx_dropped counter of its device. A reference on skb->dev is taken
 * for every queued frame. The skb is consumed in all cases.
 */
static void ipvlan_multicast_enqueue(struct ipvl_port *port,
				     struct sk_buff *skb, bool tx_pkt)
{
	if (skb->protocol == htons(ETH_P_PAUSE)) {
		kfree_skb(skb);
		return;
	}

	/* Record whether the deferred packet is from the TX or RX path.
	 * Looking at the MAC addresses on the packet would lead to erroneous
	 * decisions. (This would be true for a loopback-mode on master
	 * device or a hair-pin mode of the switch.)
	 */
	IPVL_SKB_CB(skb)->tx_pkt = tx_pkt;

	spin_lock(&port->backlog.lock);
	if (skb_queue_len(&port->backlog) < IPVLAN_QBACKLOG_LIMIT) {
		/* Keep the device alive until the work item has run. */
		if (skb->dev)
			dev_hold(skb->dev);
		__skb_queue_tail(&port->backlog, skb);
		spin_unlock(&port->backlog.lock);
		schedule_work(&port->wq);
	} else {
		spin_unlock(&port->backlog.lock);
		/* Same NULL guard as on the enqueue path above. */
		if (skb->dev)
			atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
	}
}

567 568 569 570 571 572 573
static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev)
{
	const struct ipvl_dev *ipvlan = netdev_priv(dev);
	void *lyr3h;
	struct ipvl_addr *addr;
	int addr_type;

574
	lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type);
575 576 577
	if (!lyr3h)
		goto out;

M
Mahesh Bandewar 已提交
578 579 580 581 582 583 584 585
	if (!ipvlan_is_vepa(ipvlan->port)) {
		addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
		if (addr) {
			if (ipvlan_is_private(ipvlan->port)) {
				consume_skb(skb);
				return NET_XMIT_DROP;
			}
			return ipvlan_rcv_frame(addr, &skb, true);
586 587
		}
	}
588
out:
589 590
	ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev);
	return ipvlan_process_outbound(skb);
591 592 593 594 595 596 597 598 599 600
}

static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev)
{
	const struct ipvl_dev *ipvlan = netdev_priv(dev);
	struct ethhdr *eth = eth_hdr(skb);
	struct ipvl_addr *addr;
	void *lyr3h;
	int addr_type;

M
Mahesh Bandewar 已提交
601 602
	if (!ipvlan_is_vepa(ipvlan->port) &&
	    ether_addr_equal(eth->h_dest, eth->h_source)) {
603
		lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type);
604 605
		if (lyr3h) {
			addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true);
606 607 608 609 610
			if (addr) {
				if (ipvlan_is_private(ipvlan->port)) {
					consume_skb(skb);
					return NET_XMIT_DROP;
				}
611
				return ipvlan_rcv_frame(addr, &skb, true);
612
			}
613 614 615 616 617 618 619 620 621 622 623 624 625
		}
		skb = skb_share_check(skb, GFP_ATOMIC);
		if (!skb)
			return NET_XMIT_DROP;

		/* Packet definitely does not belong to any of the
		 * virtual devices, but the dest is local. So forward
		 * the skb for the main-dev. At the RX side we just return
		 * RX_PASS for it to be processed further on the stack.
		 */
		return dev_forward_skb(ipvlan->phy_dev, skb);

	} else if (is_multicast_ether_addr(eth->h_dest)) {
626
		ipvlan_skb_crossing_ns(skb, NULL);
M
Mahesh Bandewar 已提交
627
		ipvlan_multicast_enqueue(ipvlan->port, skb, true);
628
		return NET_XMIT_SUCCESS;
629 630
	}

631
	skb->dev = ipvlan->phy_dev;
632 633 634 635 636 637
	return dev_queue_xmit(skb);
}

int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ipvl_dev *ipvlan = netdev_priv(dev);
638
	struct ipvl_port *port = ipvlan_port_get_rcu_bh(ipvlan->phy_dev);
639 640 641 642 643 644 645 646 647 648 649

	if (!port)
		goto out;

	if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
		goto out;

	switch(port->mode) {
	case IPVLAN_MODE_L2:
		return ipvlan_xmit_mode_l2(skb, dev);
	case IPVLAN_MODE_L3:
M
Mahesh Bandewar 已提交
650
	case IPVLAN_MODE_L3S:
651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669
		return ipvlan_xmit_mode_l3(skb, dev);
	}

	/* Should not reach here */
	WARN_ONCE(true, "ipvlan_queue_xmit() called for mode = [%hx]\n",
			  port->mode);
out:
	kfree_skb(skb);
	return NET_XMIT_DROP;
}

/* Decide whether a received frame originated outside this port.
 *
 * A frame is considered internal (returns false) only when its source MAC
 * equals the receiving device's address AND its source L3 address belongs
 * to one of the port's ipvlan devices. Everything else, including frames
 * whose L3 header cannot be parsed, is external (returns true).
 */
static bool ipvlan_external_frame(struct sk_buff *skb, struct ipvl_port *port)
{
	struct ethhdr *eth = eth_hdr(skb);
	void *l3_hdr;
	int l3_type;

	if (!ether_addr_equal(eth->h_source, skb->dev->dev_addr))
		return true;

	l3_hdr = ipvlan_get_L3_hdr(port, skb, &l3_type);
	if (!l3_hdr)
		return true;

	return !ipvlan_addr_lookup(port, l3_hdr, l3_type, false);
}

static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb,
						 struct ipvl_port *port)
{
	void *lyr3h;
	int addr_type;
	struct ipvl_addr *addr;
	struct sk_buff *skb = *pskb;
	rx_handler_result_t ret = RX_HANDLER_PASS;

691
	lyr3h = ipvlan_get_L3_hdr(port, skb, &addr_type);
692 693 694 695 696
	if (!lyr3h)
		goto out;

	addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true);
	if (addr)
697
		ret = ipvlan_rcv_frame(addr, pskb, false);
698 699 700 701 702 703 704 705 706 707 708 709 710

out:
	return ret;
}

static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb,
						 struct ipvl_port *port)
{
	struct sk_buff *skb = *pskb;
	struct ethhdr *eth = eth_hdr(skb);
	rx_handler_result_t ret = RX_HANDLER_PASS;

	if (is_multicast_ether_addr(eth->h_dest)) {
711 712 713 714 715 716 717 718 719
		if (ipvlan_external_frame(skb, port)) {
			struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

			/* External frames are queued for device local
			 * distribution, but a copy is given to master
			 * straight away to avoid sending duplicates later
			 * when work-queue processes this frame. This is
			 * achieved by returning RX_HANDLER_PASS.
			 */
720 721
			if (nskb) {
				ipvlan_skb_crossing_ns(nskb, NULL);
M
Mahesh Bandewar 已提交
722
				ipvlan_multicast_enqueue(port, nskb, false);
723
			}
724
		}
725
	} else {
726 727
		/* Perform like l3 mode for non-multicast packet */
		ret = ipvlan_handle_mode_l3(pskb, port);
728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745
	}

	return ret;
}

rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct ipvl_port *port = ipvlan_port_get_rcu(skb->dev);

	if (!port)
		return RX_HANDLER_PASS;

	switch (port->mode) {
	case IPVLAN_MODE_L2:
		return ipvlan_handle_mode_l2(pskb, port);
	case IPVLAN_MODE_L3:
		return ipvlan_handle_mode_l3(pskb, port);
M
Mahesh Bandewar 已提交
746 747
	case IPVLAN_MODE_L3S:
		return RX_HANDLER_PASS;
748 749 750 751 752 753
	}

	/* Should not reach here */
	WARN_ONCE(true, "ipvlan_handle_frame() called for mode = [%hx]\n",
			  port->mode);
	kfree_skb(skb);
754
	return RX_HANDLER_CONSUMED;
755
}
M
Mahesh Bandewar 已提交
756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771

/* Map @skb to the ipvlan address entry owning its destination address.
 *
 * Only applies when @dev is an ipvlan port in L3S mode; returns NULL in
 * every other case, when the L3 header cannot be parsed, or when no
 * ipvlan device owns the destination.
 */
static struct ipvl_addr *ipvlan_skb_to_addr(struct sk_buff *skb,
					    struct net_device *dev)
{
	struct ipvl_port *port;
	void *l3_hdr;
	int l3_type;

	if (!dev || !netif_is_ipvlan_port(dev))
		return NULL;

	port = ipvlan_port_get_rcu(dev);
	if (!port || port->mode != IPVLAN_MODE_L3S)
		return NULL;

	l3_hdr = ipvlan_get_L3_hdr(port, skb, &l3_type);
	if (!l3_hdr)
		return NULL;

	return ipvlan_addr_lookup(port, l3_hdr, l3_type, true);
}

/* L3S receive hook: when @skb is destined to an address owned by an
 * ipvlan device, redo the input route lookup as if the packet had arrived
 * on that ipvlan device.
 *
 * @proto selects the family (AF_INET / AF_INET6); other values leave the
 * skb untouched. The same @skb is returned in all cases.
 */
struct sk_buff *ipvlan_l3_rcv(struct net_device *dev, struct sk_buff *skb,
			      u16 proto)
{
	struct ipvl_addr *addr = ipvlan_skb_to_addr(skb, dev);
	struct net_device *sdev;

	if (!addr)
		return skb;

	sdev = addr->master->dev;

	switch (proto) {
	case AF_INET:
	{
		struct iphdr *ip4h = ip_hdr(skb);
		int err;

		err = ip_route_input_noref(skb, ip4h->daddr, ip4h->saddr,
					   ip4h->tos, sdev);
		if (unlikely(err))
			return skb;
		break;
	}
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
	{
		struct ipv6hdr *ip6h = ipv6_hdr(skb);
		struct dst_entry *dst;
		int flags = RT6_LOOKUP_F_HAS_SADDR;
		struct flowi6 fl6 = {
			.flowi6_iif   = sdev->ifindex,
			.daddr        = ip6h->daddr,
			.saddr        = ip6h->saddr,
			.flowlabel    = ip6_flowinfo(ip6h),
			.flowi6_mark  = skb->mark,
			.flowi6_proto = ip6h->nexthdr,
		};

		/* Replace whatever route the skb carried so far. */
		skb_dst_drop(skb);
		dst = ip6_route_input_lookup(dev_net(sdev), sdev, &fl6,
					     skb, flags);
		skb_dst_set(skb, dst);
		break;
	}
#endif
	default:
		break;
	}

	return skb;
}

/* Netfilter hook: when @skb is destined to an address owned by an ipvlan
 * device (see ipvlan_skb_to_addr()), re-point the skb at that device and
 * count it as received. The verdict is always NF_ACCEPT.
 */
unsigned int ipvlan_nf_input(void *priv, struct sk_buff *skb,
			     const struct nf_hook_state *state)
{
	struct ipvl_addr *owner = ipvlan_skb_to_addr(skb, skb->dev);

	if (owner) {
		unsigned int len;

		skb->dev = owner->master->dev;
		len = skb->len + ETH_HLEN;
		ipvlan_count_rx(owner->master, len, true, false);
	}

	return NF_ACCEPT;
}