#include <linux/skbuff.h>
#include <linux/export.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_vlan.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <linux/igmp.h>
#include <linux/icmp.h>
#include <linux/sctp.h>
#include <linux/dccp.h>
#include <linux/if_tunnel.h>
#include <linux/if_pppox.h>
#include <linux/ppp_defs.h>
#include <net/flow_keys.h>
#include <scsi/fc/fc_fcoe.h>

/* copy saddr & daddr, possibly using 64bit load/store
 * Equivalent to :	flow->src = iph->saddr;
 *			flow->dst = iph->daddr;
 */
static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph)
{
	BUILD_BUG_ON(offsetof(typeof(*flow), dst) !=
		     offsetof(typeof(*flow), src) + sizeof(flow->src));
	memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst));
}

/**
 * __skb_flow_get_ports - extract the upper layer ports and return them
 * @skb: sk_buff to extract the ports from
 * @thoff: transport header offset
 * @ip_proto: protocol for which to get port offset
 * @data: raw buffer pointer to the packet, if NULL use skb->data
 * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
 *
 * The function will try to retrieve the ports at offset thoff + poff where poff
 * is the protocol port offset returned from proto_ports_offset
 */
__be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
			    void *data, int hlen)
{
	int poff = proto_ports_offset(ip_proto);

	if (!data) {
		data = skb->data;
		hlen = skb_headlen(skb);
	}

	if (poff >= 0) {
		__be32 *ports, _ports;

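		/* For typical transport protocols the 16-bit source and
		 * destination ports are adjacent, so a single 32-bit load
		 * at thoff + poff fetches both of them at once.
		 */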
		ports = __skb_header_pointer(skb, thoff + poff,
					     sizeof(_ports), data, hlen, &_ports);
		if (ports)
			return *ports;
	}

	return 0;
}
EXPORT_SYMBOL(__skb_flow_get_ports);

/**
 * __skb_flow_dissect - extract the flow_keys struct and return it
 * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified
 * @data: raw buffer pointer to the packet, if NULL use skb->data
 * @proto: protocol for which to get the flow, if @data is NULL use skb->protocol
 * @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
 * @hlen: packet header length, if @data is NULL use skb_headlen(skb)
 *
 * The function will try to retrieve the struct flow_keys from either the skbuff
 * or a raw buffer specified by the rest parameters
 */
bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow,
			void *data, __be16 proto, int nhoff, int hlen)
{
	u8 ip_proto;

	if (!data) {
		data = skb->data;
		proto = skb->protocol;
		nhoff = skb_network_offset(skb);
		hlen = skb_headlen(skb);
	}

	memset(flow, 0, sizeof(*flow));

again:
	switch (proto) {
	case htons(ETH_P_IP): {
		const struct iphdr *iph;
		struct iphdr _iph;
ip:
		iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
		if (!iph || iph->ihl < 5)
			return false;
		nhoff += iph->ihl * 4;

		ip_proto = iph->protocol;
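		/* Fragments other than the first carry no transport header,
		 * so clear ip_proto to skip port extraction; this also keeps
		 * all fragments of a datagram hashing to the same value.
		 */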
		if (ip_is_fragment(iph))
			ip_proto = 0;

		/* skip the address processing if skb is NULL.  The assumption
		 * here is that if there is no skb we are not looking for flow
		 * info but lengths and protocols.
		 */
		if (!skb)
			break;

		iph_to_flow_copy_addrs(flow, iph);
		break;
	}
	case htons(ETH_P_IPV6): {
		const struct ipv6hdr *iph;
		struct ipv6hdr _iph;
		__be32 flow_label;

ipv6:
		iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
		if (!iph)
			return false;

		ip_proto = iph->nexthdr;
		nhoff += sizeof(struct ipv6hdr);

		/* see comment above in IPv4 section */
		if (!skb)
			break;

		flow->src = (__force __be32)ipv6_addr_hash(&iph->saddr);
		flow->dst = (__force __be32)ipv6_addr_hash(&iph->daddr);

		flow_label = ip6_flowlabel(iph);
		if (flow_label) {
			/* Awesome, IPv6 packet has a flow label so we can
			 * use that to represent the ports without any
			 * further dissection.
			 */
			flow->n_proto = proto;
			flow->ip_proto = ip_proto;
			flow->ports = flow_label;
			flow->thoff = (u16)nhoff;

			return true;
		}

		break;
	}
	case htons(ETH_P_8021AD):
	case htons(ETH_P_8021Q): {
		const struct vlan_hdr *vlan;
		struct vlan_hdr _vlan;

		vlan = __skb_header_pointer(skb, nhoff, sizeof(_vlan), data, hlen, &_vlan);
		if (!vlan)
			return false;

		proto = vlan->h_vlan_encapsulated_proto;
		nhoff += sizeof(*vlan);
		goto again;
	}
	case htons(ETH_P_PPP_SES): {
		struct {
			struct pppoe_hdr hdr;
			__be16 proto;
		} *hdr, _hdr;
		hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
		if (!hdr)
			return false;
		proto = hdr->proto;
		nhoff += PPPOE_SES_HLEN;
		switch (proto) {
		case htons(PPP_IP):
			goto ip;
		case htons(PPP_IPV6):
			goto ipv6;
		default:
			return false;
		}
	}
	case htons(ETH_P_TIPC): {
		struct {
			__be32 pre[3];
			__be32 srcnode;
		} *hdr, _hdr;
		hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
		if (!hdr)
			return false;
		flow->src = hdr->srcnode;
		flow->dst = 0;
		flow->n_proto = proto;
		flow->thoff = (u16)nhoff;
		return true;
	}
	case htons(ETH_P_FCOE):
		flow->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
		/* fall through */
	default:
		return false;
	}

	switch (ip_proto) {
	case IPPROTO_GRE: {
		struct gre_hdr {
			__be16 flags;
			__be16 proto;
		} *hdr, _hdr;

		hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr);
		if (!hdr)
			return false;
		/*
		 * Only look inside GRE if version zero and no
		 * routing
		 */
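		/* The base GRE header is 4 bytes; the optional checksum
		 * (plus reserved), key and sequence number fields each add
		 * another 4 bytes when their flag bit is set.
		 */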
		if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) {
			proto = hdr->proto;
			nhoff += 4;
			if (hdr->flags & GRE_CSUM)
				nhoff += 4;
			if (hdr->flags & GRE_KEY)
				nhoff += 4;
			if (hdr->flags & GRE_SEQ)
				nhoff += 4;
			if (proto == htons(ETH_P_TEB)) {
				const struct ethhdr *eth;
				struct ethhdr _eth;

				eth = __skb_header_pointer(skb, nhoff,
							   sizeof(_eth),
							   data, hlen, &_eth);
				if (!eth)
					return false;
				proto = eth->h_proto;
				nhoff += sizeof(*eth);
			}
			goto again;
		}
		break;
	}
	case IPPROTO_IPIP:
		proto = htons(ETH_P_IP);
		goto ip;
	case IPPROTO_IPV6:
		proto = htons(ETH_P_IPV6);
		goto ipv6;
	default:
		break;
	}

	flow->n_proto = proto;
	flow->ip_proto = ip_proto;
	flow->thoff = (u16) nhoff;

	/* unless skb is set we don't need to record port info */
	if (skb)
		flow->ports = __skb_flow_get_ports(skb, nhoff, ip_proto,
						   data, hlen);

	return true;
}
EXPORT_SYMBOL(__skb_flow_dissect);

static u32 hashrnd __read_mostly;
static __always_inline void __flow_hash_secret_init(void)
{
	net_get_random_once(&hashrnd, sizeof(hashrnd));
}

static __always_inline u32 __flow_hash_3words(u32 a, u32 b, u32 c, u32 keyval)
{
	return jhash_3words(a, b, c, keyval);
}

static inline u32 __flow_hash_from_keys(struct flow_keys *keys, u32 keyval)
{
	u32 hash;

	/* get a consistent hash (same value on both flow directions) */
	if (((__force u32)keys->dst < (__force u32)keys->src) ||
	    (((__force u32)keys->dst == (__force u32)keys->src) &&
	     ((__force u16)keys->port16[1] < (__force u16)keys->port16[0]))) {
		swap(keys->dst, keys->src);
		swap(keys->port16[0], keys->port16[1]);
	}

	hash = __flow_hash_3words((__force u32)keys->dst,
				  (__force u32)keys->src,
				  (__force u32)keys->ports,
				  keyval);
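	/* Zero is reserved to mean "no valid hash", so never return it. */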
	if (!hash)
		hash = 1;

	return hash;
}

u32 flow_hash_from_keys(struct flow_keys *keys)
{
	__flow_hash_secret_init();
	return __flow_hash_from_keys(keys, hashrnd);
}
EXPORT_SYMBOL(flow_hash_from_keys);

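/* Dissect the packet and fold the resulting keys into a hash;
 * returns 0 if the packet could not be dissected.
 */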
static inline u32 ___skb_get_hash(const struct sk_buff *skb,
				  struct flow_keys *keys, u32 keyval)
{
	if (!skb_flow_dissect(skb, keys))
		return 0;

	return __flow_hash_from_keys(keys, keyval);
}

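/* Fixed layout overlaid on struct flow_keys_digest;
 * make_flow_keys_digest() fills it from a struct flow_keys and zeroes
 * the rest of the digest.
 */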
struct _flow_keys_digest_data {
	__be16	n_proto;
	u8	ip_proto;
	u8	padding;
	__be32	ports;
	__be32	src;
	__be32	dst;
};

void make_flow_keys_digest(struct flow_keys_digest *digest,
			   const struct flow_keys *flow)
{
	struct _flow_keys_digest_data *data =
	    (struct _flow_keys_digest_data *)digest;

	BUILD_BUG_ON(sizeof(*data) > sizeof(*digest));

	memset(digest, 0, sizeof(*digest));

	data->n_proto = flow->n_proto;
	data->ip_proto = flow->ip_proto;
	data->ports = flow->ports;
	data->src = flow->src;
	data->dst = flow->dst;
}
EXPORT_SYMBOL(make_flow_keys_digest);

/*
 * __skb_get_hash: calculate a flow hash based on src/dst addresses
 * and src/dst port numbers.  Sets hash in skb to non-zero hash value
 * on success, zero indicates no valid hash.  Also, sets l4_hash in skb
 * if hash is a canonical 4-tuple hash over transport ports.
 */
void __skb_get_hash(struct sk_buff *skb)
{
	struct flow_keys keys;
	u32 hash;

	__flow_hash_secret_init();

	hash = ___skb_get_hash(skb, &keys, hashrnd);
	if (!hash)
		return;

	if (keys.ports)
		skb->l4_hash = 1;

	skb->sw_hash = 1;

	skb->hash = hash;
}
EXPORT_SYMBOL(__skb_get_hash);

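/* Like skb_get_hash(), but mixes in a caller-supplied perturbation value
 * instead of the global seed and does not store the result in the skb.
 */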
__u32 skb_get_hash_perturb(const struct sk_buff *skb, u32 perturb)
{
	struct flow_keys keys;

	return ___skb_get_hash(skb, &keys, perturb);
}
EXPORT_SYMBOL(skb_get_hash_perturb);

/*
 * Returns a Tx hash based on the given packet descriptor and the number of
 * Tx queues, which is used as the distribution range.
 */
u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
		  unsigned int num_tx_queues)
{
	u32 hash;
	u16 qoffset = 0;
	u16 qcount = num_tx_queues;

	if (skb_rx_queue_recorded(skb)) {
		hash = skb_get_rx_queue(skb);
		while (unlikely(hash >= num_tx_queues))
			hash -= num_tx_queues;
		return hash;
	}

	if (dev->num_tc) {
		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
		qoffset = dev->tc_to_txq[tc].offset;
		qcount = dev->tc_to_txq[tc].count;
	}

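	/* reciprocal_scale() maps the 32-bit flow hash onto [0, qcount)
	 * without a divide; qoffset shifts the result into the traffic
	 * class's queue range.
	 */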
	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
EXPORT_SYMBOL(__skb_tx_hash);

u32 __skb_get_poff(const struct sk_buff *skb, void *data,
		   const struct flow_keys *keys, int hlen)
{
	u32 poff = keys->thoff;

	switch (keys->ip_proto) {
	case IPPROTO_TCP: {
		/* access doff as u8 to avoid unaligned access */
		const u8 *doff;
		u8 _doff;

		doff = __skb_header_pointer(skb, poff + 12, sizeof(_doff),
					    data, hlen, &_doff);
		if (!doff)
			return poff;

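		/* The high nibble of this byte is the TCP data offset in
		 * 32-bit words; (*doff & 0xF0) >> 2 converts it straight
		 * to bytes.
		 */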
		poff += max_t(u32, sizeof(struct tcphdr), (*doff & 0xF0) >> 2);
		break;
	}
	case IPPROTO_UDP:
	case IPPROTO_UDPLITE:
		poff += sizeof(struct udphdr);
		break;
	/* For the rest, we do not really care about header
	 * extensions at this point for now.
	 */
	case IPPROTO_ICMP:
		poff += sizeof(struct icmphdr);
		break;
	case IPPROTO_ICMPV6:
		poff += sizeof(struct icmp6hdr);
		break;
	case IPPROTO_IGMP:
		poff += sizeof(struct igmphdr);
		break;
	case IPPROTO_DCCP:
		poff += sizeof(struct dccp_hdr);
		break;
	case IPPROTO_SCTP:
		poff += sizeof(struct sctphdr);
		break;
	}

	return poff;
}

/* skb_get_poff() returns the offset to the payload as far as it could
 * be dissected. The main user is currently BPF, which can then dynamically
 * truncate packets and analyze only their headers without having to push
 * the actual payload to user space.
 */
u32 skb_get_poff(const struct sk_buff *skb)
{
	struct flow_keys keys;

	if (!skb_flow_dissect(skb, &keys))
		return 0;

	return __skb_get_poff(skb, skb->data, &keys, skb_headlen(skb));
}

static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	int queue_index = -1;

	rcu_read_lock();
	dev_maps = rcu_dereference(dev->xps_maps);
	if (dev_maps) {
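		/* sender_cpu is stored 1-based (zero means "not yet set"),
		 * hence the -1 when indexing the per-CPU XPS map.
		 */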
		map = rcu_dereference(
		    dev_maps->cpu_map[skb->sender_cpu - 1]);
		if (map) {
			if (map->len == 1)
				queue_index = map->queues[0];
			else
				queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
									   map->len)];
			if (unlikely(queue_index >= dev->real_num_tx_queues))
				queue_index = -1;
		}
	}
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}

static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	int queue_index = sk_tx_queue_get(sk);

	if (queue_index < 0 || skb->ooo_okay ||
	    queue_index >= dev->real_num_tx_queues) {
		int new_index = get_xps_queue(dev, skb);
		if (new_index < 0)
			new_index = skb_tx_hash(dev, skb);

		if (queue_index != new_index && sk &&
		    rcu_access_pointer(sk->sk_dst_cache))
			sk_tx_queue_set(sk, new_index);

		queue_index = new_index;
	}

	return queue_index;
}

struct netdev_queue *netdev_pick_tx(struct net_device *dev,
				    struct sk_buff *skb,
				    void *accel_priv)
{
	int queue_index = 0;

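	/* Record the transmitting CPU (1-based, 0 == unset) so that
	 * get_xps_queue() can look up the per-CPU queue map later.
	 */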
#ifdef CONFIG_XPS
	if (skb->sender_cpu == 0)
		skb->sender_cpu = raw_smp_processor_id() + 1;
#endif

	if (dev->real_num_tx_queues != 1) {
		const struct net_device_ops *ops = dev->netdev_ops;
		if (ops->ndo_select_queue)
			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
							    __netdev_pick_tx);
		else
			queue_index = __netdev_pick_tx(dev, skb);

		if (!accel_priv)
			queue_index = netdev_cap_txqueue(dev, queue_index);
	}

	skb_set_queue_mapping(skb, queue_index);
	return netdev_get_tx_queue(dev, queue_index);
}