actions.c 34.1 KB
Newer Older
1
/*
2
 * Copyright (c) 2007-2017 Nicira, Inc.
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/skbuff.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/openvswitch.h>
J
Joe Stringer 已提交
25
#include <linux/netfilter_ipv6.h>
J
Joe Stringer 已提交
26
#include <linux/sctp.h>
27 28 29 30 31
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/in6.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
32

J
Joe Stringer 已提交
33
#include <net/dst.h>
34
#include <net/ip.h>
A
Ansis Atteka 已提交
35
#include <net/ipv6.h>
J
Joe Stringer 已提交
36
#include <net/ip6_fib.h>
37 38
#include <net/checksum.h>
#include <net/dsfield.h>
39
#include <net/mpls.h>
J
Joe Stringer 已提交
40
#include <net/sctp/checksum.h>
41 42

#include "datapath.h"
43
#include "flow.h"
J
Joe Stringer 已提交
44
#include "conntrack.h"
45 46
#include "vport.h"

47 48 49
/* One queued unit of work: a packet plus the actions list to run on it,
 * executed after the current actions list finishes (bounds recursion).
 */
struct deferred_action {
	struct sk_buff *skb;
	const struct nlattr *actions;
	int actions_len;

	/* Store pkt_key clone when creating deferred action. */
	struct sw_flow_key pkt_key;
};

J
Joe Stringer 已提交
56 57 58 59 60 61
/* Worst-case saved L2 header: Ethernet + VLAN tag + three MPLS labels. */
#define MAX_L2_LEN	(VLAN_ETH_HLEN + 3 * MPLS_HLEN)
/* Per-packet state saved by prepare_frag() and restored on every fragment
 * by ovs_vport_output().
 */
struct ovs_frag_data {
	unsigned long dst;	/* saved skb->_skb_refdst */
	struct vport *vport;	/* egress vport for the fragments */
	struct ovs_skb_cb cb;
	__be16 inner_protocol;
	u16 network_offset;	/* valid only for MPLS */
	u16 vlan_tci;
	__be16 vlan_proto;
	unsigned int l2_len;	/* bytes saved in l2_data */
	u8 mac_proto;
	u8 l2_data[MAX_L2_LEN];
};

/* Per-CPU scratch area; safe because fragmentation runs to completion
 * within one softirq context.
 */
static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage);

72
#define DEFERRED_ACTION_FIFO_SIZE 10
#define OVS_RECURSION_LIMIT 5
#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2)
/* Fixed-size queue of deferred actions; overflow causes packet drops. */
struct action_fifo {
	int head;
	int tail;
	/* Deferred action fifo queue storage. */
	struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
};

/* Pre-allocated flow-key clones, one slot per allowed recursion level. */
struct action_flow_keys {
	struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
};

static struct action_fifo __percpu *action_fifos;
static struct action_flow_keys __percpu *flow_keys;
/* Current depth of nested actions execution on this CPU. */
static DEFINE_PER_CPU(int, exec_actions_level);

90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106
/* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys'
 * space. Return NULL if out of key spaces.
 */
static struct sw_flow_key *clone_key(const struct sw_flow_key *key_)
{
	struct action_flow_keys *keys = this_cpu_ptr(flow_keys);
	int level = this_cpu_read(exec_actions_level);
	struct sw_flow_key *key;

	/* Levels beyond the threshold have no reserved slot. */
	if (level > OVS_DEFERRED_ACTION_THRESHOLD)
		return NULL;

	key = &keys->key[level - 1];
	*key = *key_;
	return key;
}

107 108 109 110 111 112
/* Reset 'fifo' to the empty state. */
static void action_fifo_init(struct action_fifo *fifo)
{
	fifo->head = fifo->tail = 0;
}

113
static bool action_fifo_is_empty(const struct action_fifo *fifo)
114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
{
	return (fifo->head == fifo->tail);
}

/* Dequeue the next deferred action, or NULL when the fifo is empty. */
static struct deferred_action *action_fifo_get(struct action_fifo *fifo)
{
	return action_fifo_is_empty(fifo) ? NULL : &fifo->fifo[fifo->tail++];
}

/* Reserve the next free slot, or return NULL when the fifo is full. */
static struct deferred_action *action_fifo_put(struct action_fifo *fifo)
{
	if (fifo->head < DEFERRED_ACTION_FIFO_SIZE - 1)
		return &fifo->fifo[fifo->head++];

	return NULL;
}

/* Queue (skb, key, actions) for deferred execution.  Returns the queued
 * entry, or NULL when the per-CPU fifo is full (caller must drop the skb).
 */
static struct deferred_action *add_deferred_actions(struct sk_buff *skb,
				    const struct sw_flow_key *key,
				    const struct nlattr *actions,
				    const int actions_len)
{
	struct action_fifo *fifo;
	struct deferred_action *da;

	fifo = this_cpu_ptr(action_fifos);
	da = action_fifo_put(fifo);
	if (da) {
		da->skb = skb;
		da->actions = actions;
		da->actions_len = actions_len;
		da->pkt_key = *key;
	}

	return da;
}

155 156
/* Mark 'key' stale: it no longer matches the (modified) packet and must be
 * re-extracted before the next flow lookup.
 */
static void invalidate_flow_key(struct sw_flow_key *key)
{
	key->mac_proto |= SW_FLOW_KEY_INVALID;
}

/* True when 'key' still describes the current packet contents. */
static bool is_flow_key_valid(const struct sw_flow_key *key)
{
	return !(key->mac_proto & SW_FLOW_KEY_INVALID);
}

165 166 167 168 169 170
/* Forward declaration: run 'actions' on 'skb' (or a clone when !last). */
static int clone_execute(struct datapath *dp, struct sk_buff *skb,
			 struct sw_flow_key *key,
			 u32 recirc_id,
			 const struct nlattr *actions, int len,
			 bool last, bool clone_flow_key);

171 172 173 174 175 176 177 178 179 180 181 182 183
/* Rewrite the Ethernet type field, keeping a CHECKSUM_COMPLETE value in
 * skb->csum consistent by folding in the old/new 16-bit difference.
 */
static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr,
			     __be16 ethertype)
{
	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		__be16 diff[] = { ~(hdr->h_proto), ethertype };

		skb->csum = ~csum_partial((char *)diff, sizeof(diff),
					~skb->csum);
	}

	hdr->h_proto = ethertype;
}

184
/* Push an MPLS label stack entry between the L2 header and the payload,
 * switching skb->protocol to the MPLS ethertype.  Returns 0 or -errno.
 */
static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
		     const struct ovs_action_push_mpls *mpls)
{
	struct mpls_shim_hdr *new_mpls_lse;

	/* Networking stack do not allow simultaneous Tunnel and MPLS GSO. */
	if (skb->encapsulation)
		return -ENOTSUPP;

	if (skb_cow_head(skb, MPLS_HLEN) < 0)
		return -ENOMEM;

	/* Record the original L3 protocol before it is hidden by MPLS. */
	if (!skb->inner_protocol) {
		skb_set_inner_network_header(skb, skb->mac_len);
		skb_set_inner_protocol(skb, skb->protocol);
	}

	/* Open MPLS_HLEN bytes of room by shifting the L2 header down. */
	skb_push(skb, MPLS_HLEN);
	memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
		skb->mac_len);
	skb_reset_mac_header(skb);
	skb_set_network_header(skb, skb->mac_len);

	new_mpls_lse = mpls_hdr(skb);
	new_mpls_lse->label_stack_entry = mpls->mpls_lse;

	skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);

	/* Only Ethernet packets carry an ethertype field to rewrite. */
	if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET)
		update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
	skb->protocol = mpls->mpls_ethertype;

	invalidate_flow_key(key);
	return 0;
}

220 221
/* Pop the outermost MPLS label stack entry and restore 'ethertype'.
 * Returns 0 or -errno.
 */
static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
		    const __be16 ethertype)
{
	int err;

	err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
	if (unlikely(err))
		return err;

	skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);

	/* Shift the L2 header up over the removed label. */
	memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
		skb->mac_len);

	__skb_pull(skb, MPLS_HLEN);
	skb_reset_mac_header(skb);
	skb_set_network_header(skb, skb->mac_len);

	if (ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET) {
		struct ethhdr *hdr;

		/* mpls_hdr() is used to locate the ethertype field correctly in the
		 * presence of VLAN tags.
		 */
		hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
		update_ethertype(skb, hdr, ethertype);
	}
	/* Only switch skb->protocol if this was the last MPLS label. */
	if (eth_p_mpls(skb->protocol))
		skb->protocol = ethertype;

	invalidate_flow_key(key);
	return 0;
}

254 255
/* Masked rewrite of the top MPLS label stack entry, folding the change
 * into skb->csum for CHECKSUM_COMPLETE skbs.  Returns 0 or -errno.
 */
static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key,
		    const __be32 *mpls_lse, const __be32 *mask)
{
	struct mpls_shim_hdr *stack;
	__be32 lse;
	int err;

	err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
	if (unlikely(err))
		return err;

	stack = mpls_hdr(skb);
	lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask);
	if (skb->ip_summed == CHECKSUM_COMPLETE) {
		__be32 diff[] = { ~(stack->label_stack_entry), lse };

		skb->csum = ~csum_partial((char *)diff, sizeof(diff),
					  ~skb->csum);
	}

	stack->label_stack_entry = lse;
	/* Keep the cached flow key in sync with the packet. */
	flow_key->mpls.top_lse = lse;
	return 0;
}

279
/* Remove the outermost VLAN tag.  If a tag remains afterwards the cached
 * flow key no longer describes the packet and is invalidated; otherwise
 * the key's VLAN fields are cleared.
 */
static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key)
{
	int err;

	err = skb_vlan_pop(skb);
	if (skb_vlan_tag_present(skb)) {
		invalidate_flow_key(key);
	} else {
		key->eth.vlan.tci = 0;
		key->eth.vlan.tpid = 0;
	}
	return err;
}

293 294
/* Push a VLAN tag.  When a tag is already present (it gets pushed into the
 * payload) the cached flow key is invalidated; otherwise the key's VLAN
 * fields are set to the new tag.
 */
static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key,
		     const struct ovs_action_push_vlan *vlan)
{
	if (skb_vlan_tag_present(skb)) {
		invalidate_flow_key(key);
	} else {
		key->eth.vlan.tci = vlan->vlan_tci;
		key->eth.vlan.tpid = vlan->vlan_tpid;
	}
	return skb_vlan_push(skb, vlan->vlan_tpid,
			     ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT);
}

306 307 308 309 310 311 312
/* 'src' is already properly masked. */
static void ether_addr_copy_masked(u8 *dst_, const u8 *src_, const u8 *mask_)
{
	u16 *dst = (u16 *)dst_;
	const u16 *src = (const u16 *)src_;
	const u16 *mask = (const u16 *)mask_;
	int i;

	/* An Ethernet address is three 16-bit words. */
	for (i = 0; i < 3; i++)
		OVS_SET_MASKED(dst[i], src[i], mask[i]);
}

/* Masked rewrite of the Ethernet source/destination addresses.  The
 * pull/push rcsum pair keeps CHECKSUM_COMPLETE consistent across the
 * in-place edit; the result is mirrored into 'flow_key'.
 */
static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
			const struct ovs_key_ethernet *key,
			const struct ovs_key_ethernet *mask)
{
	int err;

	err = skb_ensure_writable(skb, ETH_HLEN);
	if (unlikely(err))
		return err;

	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);

	ether_addr_copy_masked(eth_hdr(skb)->h_source, key->eth_src,
			       mask->eth_src);
	ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst,
			       mask->eth_dst);

	skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);

	ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source);
	ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest);
	return 0;
}

342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382
/* pop_eth does not support VLAN packets as this action is never called
 * for them.
 */
static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key)
{
	/* Strip the Ethernet header; the packet becomes bare L3. */
	skb_pull_rcsum(skb, ETH_HLEN);
	skb_reset_mac_header(skb);
	skb_reset_mac_len(skb);

	/* safe right before invalidate_flow_key */
	key->mac_proto = MAC_PROTO_NONE;
	invalidate_flow_key(key);
	return 0;
}

/* Prepend an Ethernet header to an L3 packet, using the addresses from the
 * action and the current skb->protocol as the ethertype.
 */
static int push_eth(struct sk_buff *skb, struct sw_flow_key *key,
		    const struct ovs_action_push_eth *ethh)
{
	struct ethhdr *hdr;

	/* Add the new Ethernet header */
	if (skb_cow_head(skb, ETH_HLEN) < 0)
		return -ENOMEM;

	skb_push(skb, ETH_HLEN);
	skb_reset_mac_header(skb);
	skb_reset_mac_len(skb);

	hdr = eth_hdr(skb);
	ether_addr_copy(hdr->h_source, ethh->addresses.eth_src);
	ether_addr_copy(hdr->h_dest, ethh->addresses.eth_dst);
	hdr->h_proto = skb->protocol;

	skb_postpush_rcsum(skb, hdr, ETH_HLEN);

	/* safe right before invalidate_flow_key */
	key->mac_proto = MAC_PROTO_ETHERNET;
	invalidate_flow_key(key);
	return 0;
}

383 384
/* Adjust the TCP/UDP checksum for an IPv4 address rewrite.  Non-first
 * fragments carry no L4 header, so they are skipped; truncated transport
 * headers are likewise left alone.
 */
static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh,
				  __be32 addr, __be32 new_addr)
{
	int transport_len = skb->len - skb_transport_offset(skb);

	if (nh->frag_off & htons(IP_OFFSET))
		return;

	if (nh->protocol == IPPROTO_TCP) {
		if (likely(transport_len >= sizeof(struct tcphdr)))
			inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb,
						 addr, new_addr, true);
	} else if (nh->protocol == IPPROTO_UDP) {
		if (likely(transport_len >= sizeof(struct udphdr))) {
			struct udphdr *uh = udp_hdr(skb);

			/* A zero UDP checksum means "not computed". */
			if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
				inet_proto_csum_replace4(&uh->check, skb,
							 addr, new_addr, true);
				if (!uh->check)
					uh->check = CSUM_MANGLED_0;
			}
		}
	}
}
408

409 410 411 412
/* Rewrite one IPv4 address in place, fixing the L4 pseudo-header checksum
 * and the IP header checksum, and dropping the stale skb hash.
 */
static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh,
			__be32 *addr, __be32 new_addr)
{
	update_ip_l4_checksum(skb, nh, *addr, new_addr);
	csum_replace4(&nh->check, *addr, new_addr);
	skb_clear_hash(skb);
	*addr = new_addr;
}

A
Ansis Atteka 已提交
418 419 420 421 422
/* Adjust the TCP/UDP/ICMPv6 checksum for an IPv6 address rewrite (the
 * address participates in the pseudo-header).  Truncated transport headers
 * are left alone.
 */
static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto,
				 __be32 addr[4], const __be32 new_addr[4])
{
	int transport_len = skb->len - skb_transport_offset(skb);

	if (l4_proto == NEXTHDR_TCP) {
		if (likely(transport_len >= sizeof(struct tcphdr)))
			inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb,
						  addr, new_addr, true);
	} else if (l4_proto == NEXTHDR_UDP) {
		if (likely(transport_len >= sizeof(struct udphdr))) {
			struct udphdr *uh = udp_hdr(skb);

			/* A zero UDP checksum means "not computed". */
			if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
				inet_proto_csum_replace16(&uh->check, skb,
							  addr, new_addr, true);
				if (!uh->check)
					uh->check = CSUM_MANGLED_0;
			}
		}
	} else if (l4_proto == NEXTHDR_ICMP) {
		if (likely(transport_len >= sizeof(struct icmp6hdr)))
			inet_proto_csum_replace16(&icmp6_hdr(skb)->icmp6_cksum,
						  skb, addr, new_addr, true);
	}
}

445 446 447
/* masked[i] = (old[i] & ~mask[i]) | (addr[i] & mask[i]) for each word. */
static void mask_ipv6_addr(const __be32 old[4], const __be32 addr[4],
			   const __be32 mask[4], __be32 masked[4])
{
	int i;

	for (i = 0; i < 4; i++)
		masked[i] = OVS_MASKED(old[i], addr[i], mask[i]);
}

A
Ansis Atteka 已提交
454 455 456 457 458 459 460
/* Rewrite one IPv6 address in place; the L4 checksum update is optional
 * because a routing header may make the destination address irrelevant to
 * the pseudo-header (decided by the caller).
 */
static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto,
			  __be32 addr[4], const __be32 new_addr[4],
			  bool recalculate_csum)
{
	if (recalculate_csum)
		update_ipv6_checksum(skb, l4_proto, addr, new_addr);

	skb_clear_hash(skb);
	memcpy(addr, new_addr, sizeof(__be32[4]));
}

465
/* Masked write of the 20-bit IPv6 flow label spread across flow_lbl[0..2]. */
static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl, u32 mask)
{
	/* Bits 21-24 are always unmasked, so this retains their values. */
	OVS_SET_MASKED(nh->flow_lbl[0], (u8)(fl >> 16), (u8)(mask >> 16));
	OVS_SET_MASKED(nh->flow_lbl[1], (u8)(fl >> 8), (u8)(mask >> 8));
	OVS_SET_MASKED(nh->flow_lbl[2], (u8)fl, (u8)mask);
}

473 474
/* Masked rewrite of the IPv4 TTL with an incremental header-checksum fix.
 * The TTL occupies the high byte of its 16-bit word, hence the << 8.
 */
static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl,
		       u8 mask)
{
	new_ttl = OVS_MASKED(nh->ttl, new_ttl, mask);

	csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8));
	nh->ttl = new_ttl;
}

482 483 484
/* Apply a masked OVS_KEY_ATTR_IPV4 set action: addresses, TOS and TTL.
 * Updates both the packet and the cached 'flow_key'.  Returns 0 or -errno.
 */
static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key,
		    const struct ovs_key_ipv4 *key,
		    const struct ovs_key_ipv4 *mask)
{
	struct iphdr *nh;
	__be32 new_addr;
	int err;

	err = skb_ensure_writable(skb, skb_network_offset(skb) +
				  sizeof(struct iphdr));
	if (unlikely(err))
		return err;

	nh = ip_hdr(skb);

	/* Setting an IP addresses is typically only a side effect of
	 * matching on them in the current userspace implementation, so it
	 * makes sense to check if the value actually changed.
	 */
	if (mask->ipv4_src) {
		new_addr = OVS_MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src);

		if (unlikely(new_addr != nh->saddr)) {
			set_ip_addr(skb, nh, &nh->saddr, new_addr);
			flow_key->ipv4.addr.src = new_addr;
		}
	}
	if (mask->ipv4_dst) {
		new_addr = OVS_MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst);

		if (unlikely(new_addr != nh->daddr)) {
			set_ip_addr(skb, nh, &nh->daddr, new_addr);
			flow_key->ipv4.addr.dst = new_addr;
		}
	}
	if (mask->ipv4_tos) {
		ipv4_change_dsfield(nh, ~mask->ipv4_tos, key->ipv4_tos);
		flow_key->ip.tos = nh->tos;
	}
	if (mask->ipv4_ttl) {
		set_ip_ttl(skb, nh, key->ipv4_ttl, mask->ipv4_ttl);
		flow_key->ip.ttl = nh->ttl;
	}

	return 0;
}

529 530 531 532 533 534 535 536
/* True when any of the four 32-bit words of the mask is set. */
static bool is_ipv6_mask_nonzero(const __be32 addr[4])
{
	return addr[0] || addr[1] || addr[2] || addr[3];
}

/* Apply a masked OVS_KEY_ATTR_IPV6 set action: addresses, traffic class,
 * flow label and hop limit.  Updates both the packet and the cached
 * 'flow_key'.  Returns 0 or -errno.
 */
static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key,
		    const struct ovs_key_ipv6 *key,
		    const struct ovs_key_ipv6 *mask)
{
	struct ipv6hdr *nh;
	int err;

	err = skb_ensure_writable(skb, skb_network_offset(skb) +
				  sizeof(struct ipv6hdr));
	if (unlikely(err))
		return err;

	nh = ipv6_hdr(skb);

	/* Setting an IP addresses is typically only a side effect of
	 * matching on them in the current userspace implementation, so it
	 * makes sense to check if the value actually changed.
	 */
	if (is_ipv6_mask_nonzero(mask->ipv6_src)) {
		__be32 *saddr = (__be32 *)&nh->saddr;
		__be32 masked[4];

		mask_ipv6_addr(saddr, key->ipv6_src, mask->ipv6_src, masked);

		if (unlikely(memcmp(saddr, masked, sizeof(masked)))) {
			set_ipv6_addr(skb, flow_key->ip.proto, saddr, masked,
				      true);
			memcpy(&flow_key->ipv6.addr.src, masked,
			       sizeof(flow_key->ipv6.addr.src));
		}
	}
	if (is_ipv6_mask_nonzero(mask->ipv6_dst)) {
		unsigned int offset = 0;
		int flags = IP6_FH_F_SKIP_RH;
		bool recalc_csum = true;
		__be32 *daddr = (__be32 *)&nh->daddr;
		__be32 masked[4];

		mask_ipv6_addr(daddr, key->ipv6_dst, mask->ipv6_dst, masked);

		if (unlikely(memcmp(daddr, masked, sizeof(masked)))) {
			/* With a routing header the header's destination is
			 * not the final one, so skip the L4 checksum fix.
			 */
			if (ipv6_ext_hdr(nh->nexthdr))
				recalc_csum = (ipv6_find_hdr(skb, &offset,
							     NEXTHDR_ROUTING,
							     NULL, &flags)
					       != NEXTHDR_ROUTING);

			set_ipv6_addr(skb, flow_key->ip.proto, daddr, masked,
				      recalc_csum);
			memcpy(&flow_key->ipv6.addr.dst, masked,
			       sizeof(flow_key->ipv6.addr.dst));
		}
	}
	if (mask->ipv6_tclass) {
		ipv6_change_dsfield(nh, ~mask->ipv6_tclass, key->ipv6_tclass);
		flow_key->ip.tos = ipv6_get_dsfield(nh);
	}
	if (mask->ipv6_label) {
		set_ipv6_fl(nh, ntohl(key->ipv6_label),
			    ntohl(mask->ipv6_label));
		flow_key->ipv6.label =
		    *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL);
	}
	if (mask->ipv6_hlimit) {
		OVS_SET_MASKED(nh->hop_limit, key->ipv6_hlimit,
			       mask->ipv6_hlimit);
		flow_key->ip.ttl = nh->hop_limit;
	}
	return 0;
}

605
/* Rewrite one transport port and incrementally fix the L4 checksum.
 * Must follow skb_ensure_writable() since that can move the skb data.
 */
static void set_tp_port(struct sk_buff *skb, __be16 *port,
			__be16 new_port, __sum16 *check)
{
	inet_proto_csum_replace2(check, skb, *port, new_port, false);
	*port = new_port;
}

613 614 615
/* Apply a masked OVS_KEY_ATTR_UDP set action to the UDP ports, preserving
 * checksum validity; updates the cached 'flow_key'.  Returns 0 or -errno.
 */
static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key,
		   const struct ovs_key_udp *key,
		   const struct ovs_key_udp *mask)
{
	struct udphdr *uh;
	__be16 src, dst;
	int err;

	err = skb_ensure_writable(skb, skb_transport_offset(skb) +
				  sizeof(struct udphdr));
	if (unlikely(err))
		return err;

	uh = udp_hdr(skb);
	/* Either of the masks is non-zero, so do not bother checking them. */
	src = OVS_MASKED(uh->source, key->udp_src, mask->udp_src);
	dst = OVS_MASKED(uh->dest, key->udp_dst, mask->udp_dst);

	/* Only fix the checksum when one exists and isn't deferred. */
	if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) {
		if (likely(src != uh->source)) {
			set_tp_port(skb, &uh->source, src, &uh->check);
			flow_key->tp.src = src;
		}
		if (likely(dst != uh->dest)) {
			set_tp_port(skb, &uh->dest, dst, &uh->check);
			flow_key->tp.dst = dst;
		}

		if (unlikely(!uh->check))
			uh->check = CSUM_MANGLED_0;
	} else {
		uh->source = src;
		uh->dest = dst;
		flow_key->tp.src = src;
		flow_key->tp.dst = dst;
	}

	skb_clear_hash(skb);

	return 0;
}

655 656 657
/* Apply a masked OVS_KEY_ATTR_TCP set action to the TCP ports with
 * incremental checksum update; updates the cached 'flow_key'.
 * Returns 0 or -errno.
 */
static int set_tcp(struct sk_buff *skb, struct sw_flow_key *flow_key,
		   const struct ovs_key_tcp *key,
		   const struct ovs_key_tcp *mask)
{
	struct tcphdr *th;
	__be16 src, dst;
	int err;

	err = skb_ensure_writable(skb, skb_transport_offset(skb) +
				  sizeof(struct tcphdr));
	if (unlikely(err))
		return err;

	th = tcp_hdr(skb);
	src = OVS_MASKED(th->source, key->tcp_src, mask->tcp_src);
	if (likely(src != th->source)) {
		set_tp_port(skb, &th->source, src, &th->check);
		flow_key->tp.src = src;
	}
	dst = OVS_MASKED(th->dest, key->tcp_dst, mask->tcp_dst);
	if (likely(dst != th->dest)) {
		set_tp_port(skb, &th->dest, dst, &th->check);
		flow_key->tp.dst = dst;
	}
	skb_clear_hash(skb);

	return 0;
}

684 685 686
/* Apply a masked OVS_KEY_ATTR_SCTP set action to the SCTP ports.  The CRC
 * is recomputed, but any pre-existing checksum error is preserved via the
 * old ^ old_correct ^ new trick.  Returns 0 or -errno.
 */
static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key,
		    const struct ovs_key_sctp *key,
		    const struct ovs_key_sctp *mask)
{
	unsigned int sctphoff = skb_transport_offset(skb);
	struct sctphdr *sh;
	__le32 old_correct_csum, new_csum, old_csum;
	int err;

	err = skb_ensure_writable(skb, sctphoff + sizeof(struct sctphdr));
	if (unlikely(err))
		return err;

	sh = sctp_hdr(skb);
	old_csum = sh->checksum;
	old_correct_csum = sctp_compute_cksum(skb, sctphoff);

	sh->source = OVS_MASKED(sh->source, key->sctp_src, mask->sctp_src);
	sh->dest = OVS_MASKED(sh->dest, key->sctp_dst, mask->sctp_dst);

	new_csum = sctp_compute_cksum(skb, sctphoff);

	/* Carry any checksum errors through. */
	sh->checksum = old_csum ^ old_correct_csum ^ new_csum;

	skb_clear_hash(skb);
	flow_key->tp.src = sh->source;
	flow_key->tp.dst = sh->dest;

	return 0;
}

716
/* Output callback invoked by the IP fragmentation code for each fragment:
 * restores the state stashed by prepare_frag() (dst, OVS cb, VLAN, L2
 * header) and transmits the fragment on the saved vport.
 */
static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct ovs_frag_data *data = this_cpu_ptr(&ovs_frag_data_storage);
	struct vport *vport = data->vport;

	if (skb_cow_head(skb, data->l2_len) < 0) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	__skb_dst_copy(skb, data->dst);
	*OVS_CB(skb) = data->cb;
	skb->inner_protocol = data->inner_protocol;
	skb->vlan_tci = data->vlan_tci;
	skb->vlan_proto = data->vlan_proto;

	/* Reconstruct the MAC header.  */
	skb_push(skb, data->l2_len);
	memcpy(skb->data, &data->l2_data, data->l2_len);
	skb_postpush_rcsum(skb, skb->data, data->l2_len);
	skb_reset_mac_header(skb);

	/* For MPLS the saved network offset points at the label stack. */
	if (eth_p_mpls(skb->protocol)) {
		skb->inner_network_header = skb->network_header;
		skb_set_network_header(skb, data->network_offset);
		skb_reset_mac_len(skb);
	}

	ovs_vport_send(vport, skb, data->mac_proto);
	return 0;
}

/* MTU callback for the stub dst installed during fragmentation: report
 * the egress device's MTU.
 */
static unsigned int
ovs_dst_get_mtu(const struct dst_entry *dst)
{
	return dst->dev->mtu;
}

/* Minimal dst_ops for the on-stack dst used by ovs_fragment(). */
static struct dst_ops ovs_dst_ops = {
	.family = AF_UNSPEC,
	.mtu = ovs_dst_get_mtu,
};

/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is
 * ovs_vport_output(), which is called once per fragmented packet.
 */
static void prepare_frag(struct vport *vport, struct sk_buff *skb,
			 u16 orig_network_offset, u8 mac_proto)
{
	unsigned int hlen = skb_network_offset(skb);
	struct ovs_frag_data *data;

	/* Stash everything the per-fragment callback must restore. */
	data = this_cpu_ptr(&ovs_frag_data_storage);
	data->dst = skb->_skb_refdst;
	data->vport = vport;
	data->cb = *OVS_CB(skb);
	data->inner_protocol = skb->inner_protocol;
	data->network_offset = orig_network_offset;
	data->vlan_tci = skb->vlan_tci;
	data->vlan_proto = skb->vlan_proto;
	data->mac_proto = mac_proto;
	data->l2_len = hlen;
	memcpy(&data->l2_data, skb->data, hlen);

	/* Hand the IP layer a clean control block and a bare L3 packet. */
	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
	skb_pull(skb, hlen);
}

784
/* Fragment 'skb' to the given MRU and send the pieces out 'vport' via
 * ovs_vport_output().  Consumes the skb (frees it on error).
 */
static void ovs_fragment(struct net *net, struct vport *vport,
			 struct sk_buff *skb, u16 mru,
			 struct sw_flow_key *key)
{
	u16 orig_network_offset = 0;

	/* Fragmentation works on the inner L3 packet for MPLS. */
	if (eth_p_mpls(skb->protocol)) {
		orig_network_offset = skb_network_offset(skb);
		skb->network_header = skb->inner_network_header;
	}

	if (skb_network_offset(skb) > MAX_L2_LEN) {
		OVS_NLERR(1, "L2 header too long to fragment");
		goto err;
	}

	if (key->eth.type == htons(ETH_P_IP)) {
		struct dst_entry ovs_dst;
		unsigned long orig_dst;

		prepare_frag(vport, skb, orig_network_offset,
			     ovs_key_mac_proto(key));
		/* On-stack, refcount-free dst just for the fragment walk. */
		dst_init(&ovs_dst, &ovs_dst_ops, NULL, 1,
			 DST_OBSOLETE_NONE, DST_NOCOUNT);
		ovs_dst.dev = vport->dev;

		orig_dst = skb->_skb_refdst;
		skb_dst_set_noref(skb, &ovs_dst);
		IPCB(skb)->frag_max_size = mru;

		ip_do_fragment(net, skb->sk, skb, ovs_vport_output);
		refdst_drop(orig_dst);
	} else if (key->eth.type == htons(ETH_P_IPV6)) {
		const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
		unsigned long orig_dst;
		struct rt6_info ovs_rt;

		/* IPv6 fragmentation is reached through netfilter ops. */
		if (!v6ops)
			goto err;

		prepare_frag(vport, skb, orig_network_offset,
			     ovs_key_mac_proto(key));
		memset(&ovs_rt, 0, sizeof(ovs_rt));
		dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, 1,
			 DST_OBSOLETE_NONE, DST_NOCOUNT);
		ovs_rt.dst.dev = vport->dev;

		orig_dst = skb->_skb_refdst;
		skb_dst_set_noref(skb, &ovs_rt.dst);
		IP6CB(skb)->frag_max_size = mru;

		v6ops->fragment(net, skb->sk, skb, ovs_vport_output);
		refdst_drop(orig_dst);
	} else {
		WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.",
			  ovs_vport_name(vport), ntohs(key->eth.type), mru,
			  vport->dev->mtu);
		goto err;
	}

	return;
err:
	kfree_skb(skb);
}

/* Transmit 'skb' on datapath port 'out_port', trimming it when a cutlen
 * (truncate action) is pending and fragmenting when it exceeds the MRU.
 * Always consumes the skb.
 */
static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
		      struct sw_flow_key *key)
{
	struct vport *vport = ovs_vport_rcu(dp, out_port);

	if (likely(vport)) {
		u16 mru = OVS_CB(skb)->mru;
		u32 cutlen = OVS_CB(skb)->cutlen;

		if (unlikely(cutlen > 0)) {
			/* Never trim past the L2 header. */
			if (skb->len - cutlen > ovs_mac_header_len(key))
				pskb_trim(skb, skb->len - cutlen);
			else
				pskb_trim(skb, ovs_mac_header_len(key));
		}

		if (likely(!mru ||
		           (skb->len <= mru + vport->dev->hard_header_len))) {
			ovs_vport_send(vport, skb, ovs_key_mac_proto(key));
		} else if (mru <= vport->dev->mtu) {
			struct net *net = read_pnet(&dp->net);

			ovs_fragment(net, vport, skb, mru, key);
		} else {
			/* Too big to send, too big to fragment: drop. */
			kfree_skb(skb);
		}
	} else {
		kfree_skb(skb);
	}
}

/* Execute an OVS_ACTION_ATTR_USERSPACE action: parse its nested
 * attributes into a dp_upcall_info and send the packet to userspace.
 * Returns the upcall result.
 */
static int output_userspace(struct datapath *dp, struct sk_buff *skb,
			    struct sw_flow_key *key, const struct nlattr *attr,
			    const struct nlattr *actions, int actions_len,
			    uint32_t cutlen)
{
	struct dp_upcall_info upcall;
	const struct nlattr *a;
	int rem;

	memset(&upcall, 0, sizeof(upcall));
	upcall.cmd = OVS_PACKET_CMD_ACTION;
	upcall.mru = OVS_CB(skb)->mru;

	for (a = nla_data(attr), rem = nla_len(attr); rem > 0;
		 a = nla_next(a, &rem)) {
		switch (nla_type(a)) {
		case OVS_USERSPACE_ATTR_USERDATA:
			upcall.userdata = a;
			break;

		case OVS_USERSPACE_ATTR_PID:
			upcall.portid = nla_get_u32(a);
			break;

		case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: {
			/* Get out tunnel info. */
			struct vport *vport;

			vport = ovs_vport_rcu(dp, nla_get_u32(a));
			if (vport) {
				int err;

				err = dev_fill_metadata_dst(vport->dev, skb);
				if (!err)
					upcall.egress_tun_info = skb_tunnel_info(skb);
			}

			break;
		}

		case OVS_USERSPACE_ATTR_ACTIONS: {
			/* Include actions. */
			upcall.actions = actions;
			upcall.actions_len = actions_len;
			break;
		}

		} /* End of switch. */
	}

	return ovs_dp_upcall(dp, skb, key, &upcall, cutlen);
}

933 934 935 936
/* When 'last' is true, sample() should always consume the 'skb'.
 * Otherwise, sample() should keep 'skb' intact regardless what
 * actions are executed within sample().
 */
static int sample(struct datapath *dp, struct sk_buff *skb,
		  struct sw_flow_key *key, const struct nlattr *attr,
		  bool last)
{
	struct nlattr *actions;
	struct nlattr *sample_arg;
	int rem = nla_len(attr);
	const struct sample_arg *arg;
	bool clone_flow_key;

	/* The first action is always 'OVS_SAMPLE_ATTR_ARG'. */
	sample_arg = nla_data(attr);
	arg = nla_data(sample_arg);
	actions = nla_next(sample_arg, &rem);

	/* Probability draw: U32_MAX means "always sample". */
	if ((arg->probability != U32_MAX) &&
	    (!arg->probability || prandom_u32() > arg->probability)) {
		if (last)
			consume_skb(skb);
		return 0;
	}

	/* 'exec' set by the validator means the key may be modified in
	 * place; otherwise it must be cloned first.
	 */
	clone_flow_key = !arg->exec;
	return clone_execute(dp, skb, key, 0, actions, rem, last,
			     clone_flow_key);
}

/* Execute an OVS_ACTION_ATTR_HASH action: store a non-zero flow hash
 * derived from the skb hash and the action's basis into the key.
 */
static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key,
			 const struct nlattr *attr)
{
	struct ovs_action_hash *hash_act = nla_data(attr);
	u32 hash;

	/* OVS_HASH_ALG_L4 is the only possible hash algorithm.  */
	hash = jhash_1word(skb_get_hash(skb), hash_act->hash_basis);
	key->ovs_flow_hash = hash ?: 0x1;
}

979 980 981 982 983 984
/* Execute an unmasked set action.  Only tunnel-info set is valid here;
 * everything else must go through execute_masked_set_action().
 */
static int execute_set_action(struct sk_buff *skb,
			      struct sw_flow_key *flow_key,
			      const struct nlattr *a)
{
	/* Only tunnel set execution is supported without a mask. */
	if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) {
		struct ovs_tunnel_info *tun = nla_data(a);

		/* Replace any existing dst with the tunnel dst (takes ref). */
		skb_dst_drop(skb);
		dst_hold((struct dst_entry *)tun->tun_dst);
		skb_dst_set(skb, (struct dst_entry *)tun->tun_dst);
		return 0;
	}

	return -EINVAL;
}

/* Mask is at the midpoint of the data: the netlink payload carries the
 * value followed by an equally-sized mask, so value-pointer + 1 is the mask.
 */
#define get_mask(a, type) ((const type)nla_data(a) + 1)

static int execute_masked_set_action(struct sk_buff *skb,
				     struct sw_flow_key *flow_key,
				     const struct nlattr *a)
1002 1003 1004
{
	int err = 0;

1005
	switch (nla_type(a)) {
1006
	case OVS_KEY_ATTR_PRIORITY:
1007 1008
		OVS_SET_MASKED(skb->priority, nla_get_u32(a),
			       *get_mask(a, u32 *));
1009
		flow_key->phy.priority = skb->priority;
1010 1011
		break;

1012
	case OVS_KEY_ATTR_SKB_MARK:
1013
		OVS_SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *));
1014
		flow_key->phy.skb_mark = skb->mark;
1015 1016
		break;

1017
	case OVS_KEY_ATTR_TUNNEL_INFO:
1018 1019
		/* Masked data not supported for tunnel. */
		err = -EINVAL;
1020 1021
		break;

1022
	case OVS_KEY_ATTR_ETHERNET:
1023 1024
		err = set_eth_addr(skb, flow_key, nla_data(a),
				   get_mask(a, struct ovs_key_ethernet *));
1025 1026 1027
		break;

	case OVS_KEY_ATTR_IPV4:
1028 1029
		err = set_ipv4(skb, flow_key, nla_data(a),
			       get_mask(a, struct ovs_key_ipv4 *));
1030 1031
		break;

A
Ansis Atteka 已提交
1032
	case OVS_KEY_ATTR_IPV6:
1033 1034
		err = set_ipv6(skb, flow_key, nla_data(a),
			       get_mask(a, struct ovs_key_ipv6 *));
A
Ansis Atteka 已提交
1035 1036
		break;

1037
	case OVS_KEY_ATTR_TCP:
1038 1039
		err = set_tcp(skb, flow_key, nla_data(a),
			      get_mask(a, struct ovs_key_tcp *));
1040 1041 1042
		break;

	case OVS_KEY_ATTR_UDP:
1043 1044
		err = set_udp(skb, flow_key, nla_data(a),
			      get_mask(a, struct ovs_key_udp *));
1045
		break;
J
Joe Stringer 已提交
1046 1047

	case OVS_KEY_ATTR_SCTP:
1048 1049
		err = set_sctp(skb, flow_key, nla_data(a),
			       get_mask(a, struct ovs_key_sctp *));
J
Joe Stringer 已提交
1050
		break;
1051 1052

	case OVS_KEY_ATTR_MPLS:
1053 1054
		err = set_mpls(skb, flow_key, nla_data(a), get_mask(a,
								    __be32 *));
1055
		break;
J
Joe Stringer 已提交
1056 1057 1058

	case OVS_KEY_ATTR_CT_STATE:
	case OVS_KEY_ATTR_CT_ZONE:
1059
	case OVS_KEY_ATTR_CT_MARK:
J
Joe Stringer 已提交
1060
	case OVS_KEY_ATTR_CT_LABELS:
1061 1062
	case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4:
	case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6:
J
Joe Stringer 已提交
1063 1064
		err = -EINVAL;
		break;
1065 1066 1067 1068 1069
	}

	return err;
}

1070 1071
static int execute_recirc(struct datapath *dp, struct sk_buff *skb,
			  struct sw_flow_key *key,
1072
			  const struct nlattr *a, bool last)
1073
{
1074
	u32 recirc_id;
1075

1076 1077 1078 1079 1080 1081 1082 1083
	if (!is_flow_key_valid(key)) {
		int err;

		err = ovs_flow_key_update(skb, key);
		if (err)
			return err;
	}
	BUG_ON(!is_flow_key_valid(key));
1084

1085 1086
	recirc_id = nla_get_u32(a);
	return clone_execute(dp, skb, key, recirc_id, NULL, 0, last, true);
1087 1088
}

1089 1090
/* Execute a list of actions against 'skb'. */
static int do_execute_actions(struct datapath *dp, struct sk_buff *skb,
1091
			      struct sw_flow_key *key,
1092
			      const struct nlattr *attr, int len)
1093 1094 1095 1096 1097 1098 1099 1100
{
	const struct nlattr *a;
	int rem;

	for (a = attr, rem = len; rem > 0;
	     a = nla_next(a, &rem)) {
		int err = 0;

1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115
		switch (nla_type(a)) {
		case OVS_ACTION_ATTR_OUTPUT: {
			int port = nla_get_u32(a);
			struct sk_buff *clone;

			/* Every output action needs a separate clone
			 * of 'skb', In case the output action is the
			 * last action, cloning can be avoided.
			 */
			if (nla_is_last(a, rem)) {
				do_output(dp, skb, port, key);
				/* 'skb' has been used for output.
				 */
				return 0;
			}
1116

1117 1118 1119
			clone = skb_clone(skb, GFP_ATOMIC);
			if (clone)
				do_output(dp, clone, port, key);
1120
			OVS_CB(skb)->cutlen = 0;
1121
			break;
1122
		}
1123

1124 1125 1126 1127 1128 1129 1130 1131
		case OVS_ACTION_ATTR_TRUNC: {
			struct ovs_action_trunc *trunc = nla_data(a);

			if (skb->len > trunc->max_len)
				OVS_CB(skb)->cutlen = skb->len - trunc->max_len;
			break;
		}

1132
		case OVS_ACTION_ATTR_USERSPACE:
1133 1134 1135
			output_userspace(dp, skb, key, a, attr,
						     len, OVS_CB(skb)->cutlen);
			OVS_CB(skb)->cutlen = 0;
1136 1137
			break;

1138 1139 1140 1141
		case OVS_ACTION_ATTR_HASH:
			execute_hash(skb, key, a);
			break;

1142
		case OVS_ACTION_ATTR_PUSH_MPLS:
1143
			err = push_mpls(skb, key, nla_data(a));
1144 1145 1146
			break;

		case OVS_ACTION_ATTR_POP_MPLS:
1147
			err = pop_mpls(skb, key, nla_get_be16(a));
1148 1149
			break;

1150
		case OVS_ACTION_ATTR_PUSH_VLAN:
1151
			err = push_vlan(skb, key, nla_data(a));
1152 1153 1154
			break;

		case OVS_ACTION_ATTR_POP_VLAN:
1155
			err = pop_vlan(skb, key);
1156 1157
			break;

1158 1159 1160 1161 1162
		case OVS_ACTION_ATTR_RECIRC: {
			bool last = nla_is_last(a, rem);

			err = execute_recirc(dp, skb, key, a, last);
			if (last) {
1163 1164 1165 1166 1167 1168 1169
				/* If this is the last action, the skb has
				 * been consumed or freed.
				 * Return immediately.
				 */
				return err;
			}
			break;
1170
		}
1171

1172
		case OVS_ACTION_ATTR_SET:
1173
			err = execute_set_action(skb, key, nla_data(a));
1174 1175
			break;

1176 1177 1178 1179 1180
		case OVS_ACTION_ATTR_SET_MASKED:
		case OVS_ACTION_ATTR_SET_TO_MASKED:
			err = execute_masked_set_action(skb, key, nla_data(a));
			break;

1181 1182 1183 1184 1185 1186 1187
		case OVS_ACTION_ATTR_SAMPLE: {
			bool last = nla_is_last(a, rem);

			err = sample(dp, skb, key, a, last);
			if (last)
				return err;

1188
			break;
1189
		}
J
Joe Stringer 已提交
1190 1191

		case OVS_ACTION_ATTR_CT:
1192 1193 1194 1195 1196 1197
			if (!is_flow_key_valid(key)) {
				err = ovs_flow_key_update(skb, key);
				if (err)
					return err;
			}

J
Joe Stringer 已提交
1198 1199 1200 1201
			err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key,
					     nla_data(a));

			/* Hide stolen IP fragments from user space. */
1202 1203
			if (err)
				return err == -EINPROGRESS ? 0 : err;
J
Joe Stringer 已提交
1204
			break;
1205

E
Eric Garver 已提交
1206 1207 1208 1209
		case OVS_ACTION_ATTR_CT_CLEAR:
			err = ovs_ct_clear(skb, key);
			break;

1210 1211 1212 1213 1214 1215 1216
		case OVS_ACTION_ATTR_PUSH_ETH:
			err = push_eth(skb, key, nla_data(a));
			break;

		case OVS_ACTION_ATTR_POP_ETH:
			err = pop_eth(skb, key);
			break;
1217 1218 1219 1220 1221 1222 1223 1224
		}

		if (unlikely(err)) {
			kfree_skb(skb);
			return err;
		}
	}

1225
	consume_skb(skb);
1226 1227 1228
	return 0;
}

1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301
/* Execute the actions on the clone of the packet. The effect of the
 * execution does not affect the original 'skb' nor the original 'key'.
 *
 * The execution may be deferred in case the actions can not be executed
 * immediately.
 */
static int clone_execute(struct datapath *dp, struct sk_buff *skb,
			 struct sw_flow_key *key, u32 recirc_id,
			 const struct nlattr *actions, int len,
			 bool last, bool clone_flow_key)
{
	struct deferred_action *da;
	struct sw_flow_key *clone;

	skb = last ? skb : skb_clone(skb, GFP_ATOMIC);
	if (!skb) {
		/* Out of memory, skip this action.
		 */
		return 0;
	}

	/* When clone_flow_key is false, the 'key' will not be change
	 * by the actions, then the 'key' can be used directly.
	 * Otherwise, try to clone key from the next recursion level of
	 * 'flow_keys'. If clone is successful, execute the actions
	 * without deferring.
	 */
	clone = clone_flow_key ? clone_key(key) : key;
	if (clone) {
		int err = 0;

		if (actions) { /* Sample action */
			if (clone_flow_key)
				__this_cpu_inc(exec_actions_level);

			err = do_execute_actions(dp, skb, clone,
						 actions, len);

			if (clone_flow_key)
				__this_cpu_dec(exec_actions_level);
		} else { /* Recirc action */
			clone->recirc_id = recirc_id;
			ovs_dp_process_packet(skb, clone);
		}
		return err;
	}

	/* Out of 'flow_keys' space. Defer actions */
	da = add_deferred_actions(skb, key, actions, len);
	if (da) {
		if (!actions) { /* Recirc action */
			key = &da->pkt_key;
			key->recirc_id = recirc_id;
		}
	} else {
		/* Out of per CPU action FIFO space. Drop the 'skb' and
		 * log an error.
		 */
		kfree_skb(skb);

		if (net_ratelimit()) {
			if (actions) { /* Sample action */
				pr_warn("%s: deferred action limit reached, drop sample action\n",
					ovs_dp_name(dp));
			} else {  /* Recirc action */
				pr_warn("%s: deferred action limit reached, drop recirc action\n",
					ovs_dp_name(dp));
			}
		}
	}
	return 0;
}

1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315
static void process_deferred_actions(struct datapath *dp)
{
	struct action_fifo *fifo = this_cpu_ptr(action_fifos);

	/* Do not touch the FIFO in case there is no deferred actions. */
	if (action_fifo_is_empty(fifo))
		return;

	/* Finishing executing all deferred actions. */
	do {
		struct deferred_action *da = action_fifo_get(fifo);
		struct sk_buff *skb = da->skb;
		struct sw_flow_key *key = &da->pkt_key;
		const struct nlattr *actions = da->actions;
1316
		int actions_len = da->actions_len;
1317 1318

		if (actions)
1319
			do_execute_actions(dp, skb, key, actions, actions_len);
1320 1321 1322 1323 1324 1325 1326 1327
		else
			ovs_dp_process_packet(skb, key);
	} while (!action_fifo_is_empty(fifo));

	/* Reset FIFO for the next packet.  */
	action_fifo_init(fifo);
}

1328
/* Execute a list of actions against 'skb'. */
1329
int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
1330 1331
			const struct sw_flow_actions *acts,
			struct sw_flow_key *key)
1332
{
1333 1334 1335
	int err, level;

	level = __this_cpu_inc_return(exec_actions_level);
1336
	if (unlikely(level > OVS_RECURSION_LIMIT)) {
1337 1338 1339 1340 1341 1342
		net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
				     ovs_dp_name(dp));
		kfree_skb(skb);
		err = -ENETDOWN;
		goto out;
	}
1343

1344
	OVS_CB(skb)->acts_origlen = acts->orig_len;
1345 1346 1347
	err = do_execute_actions(dp, skb, key,
				 acts->actions, acts->actions_len);

1348
	if (level == 1)
1349 1350
		process_deferred_actions(dp);

1351 1352
out:
	__this_cpu_dec(exec_actions_level);
1353 1354 1355 1356 1357 1358 1359 1360
	return err;
}

/* Allocate the per-CPU deferred-action FIFOs and flow-key stacks used by
 * clone_execute()/process_deferred_actions().  Returns 0 or -ENOMEM.
 */
int action_fifos_init(void)
{
	action_fifos = alloc_percpu(struct action_fifo);
	if (!action_fifos)
		return -ENOMEM;

	flow_keys = alloc_percpu(struct action_flow_keys);
	if (!flow_keys) {
		/* Unwind the first allocation on partial failure. */
		free_percpu(action_fifos);
		return -ENOMEM;
	}

	return 0;
}

void action_fifos_exit(void)
{
	free_percpu(action_fifos);
1374
	free_percpu(flow_keys);
1375
}