// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2007-2014 Nicira, Inc.
 */

#include "flow.h"
#include "datapath.h"
#include "flow_netlink.h"
#include <linux/uaccess.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <net/llc_pdu.h>
#include <linux/kernel.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/llc.h>
#include <linux/module.h>
#include <linux/in.h>
#include <linux/rcupdate.h>
#include <linux/cpumask.h>
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/sctp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/rculist.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ndisc.h>

#define TBL_MIN_BUCKETS		1024
#define MASK_ARRAY_SIZE_MIN	16
#define REHASH_INTERVAL		(10 * 60 * HZ)

#define MC_HASH_SHIFT		8
#define MC_HASH_ENTRIES		(1u << MC_HASH_SHIFT)
#define MC_HASH_SEGS		((sizeof(uint32_t) * 8) / MC_HASH_SHIFT)

static struct kmem_cache *flow_cache;
struct kmem_cache *flow_stats_cache __read_mostly;

static u16 range_n_bytes(const struct sw_flow_key_range *range)
{
	return range->end - range->start;
}

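/* Apply 'mask' to 'src' and store the result in 'dst'. */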
void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
		       bool full, const struct sw_flow_mask *mask)
{
	int start = full ? 0 : mask->range.start;
	int len = full ? sizeof *dst : range_n_bytes(&mask->range);
	const long *m = (const long *)((const u8 *)&mask->key + start);
	const long *s = (const long *)((const u8 *)src + start);
	long *d = (long *)((u8 *)dst + start);
	int i;

	/* If 'full' is true then all of 'dst' is fully initialized. Otherwise,
	 * if 'full' is false the memory outside of the 'mask->range' is left
	 * uninitialized. This can be used as an optimization when further
	 * operations on 'dst' only use contents within 'mask->range'.
	 */
	for (i = 0; i < len; i += sizeof(long))
		*d++ = *s++ & *m++;
}

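/* Allocate a new flow along with its initial statistics node, placed on
 * NUMA node 0 when that node is online.
 */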
struct sw_flow *ovs_flow_alloc(void)
{
	struct sw_flow *flow;
	struct sw_flow_stats *stats;

	flow = kmem_cache_zalloc(flow_cache, GFP_KERNEL);
	if (!flow)
		return ERR_PTR(-ENOMEM);

	flow->stats_last_writer = -1;

	/* Initialize the default stat node. */
	stats = kmem_cache_alloc_node(flow_stats_cache,
				      GFP_KERNEL | __GFP_ZERO,
				      node_online(0) ? 0 : NUMA_NO_NODE);
	if (!stats)
		goto err;

	spin_lock_init(&stats->lock);

	RCU_INIT_POINTER(flow->stats[0], stats);

	cpumask_set_cpu(0, &flow->cpu_used_mask);

	return flow;
err:
	kmem_cache_free(flow_cache, flow);
	return ERR_PTR(-ENOMEM);
}

int ovs_flow_tbl_count(const struct flow_table *table)
{
	return table->count;
}

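/* Free the flow itself together with its unmasked key, its actions and any
 * per-CPU statistics nodes that were allocated for it.
 */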
static void flow_free(struct sw_flow *flow)
{
	int cpu;

	if (ovs_identifier_is_key(&flow->id))
		kfree(flow->id.unmasked_key);
	if (flow->sf_acts)
		ovs_nla_free_flow_actions((struct sw_flow_actions __force *)flow->sf_acts);
	/* We open code this to make sure cpu 0 is always considered */
	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask))
		if (flow->stats[cpu])
			kmem_cache_free(flow_stats_cache,
					(struct sw_flow_stats __force *)flow->stats[cpu]);
	kmem_cache_free(flow_cache, flow);
}

static void rcu_free_flow_callback(struct rcu_head *rcu)
{
	struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);

	flow_free(flow);
}

void ovs_flow_free(struct sw_flow *flow, bool deferred)
{
	if (!flow)
		return;

	if (deferred)
		call_rcu(&flow->rcu, rcu_free_flow_callback);
	else
		flow_free(flow);
}

static void __table_instance_destroy(struct table_instance *ti)
{
	kvfree(ti->buckets);
	kfree(ti);
}

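/* Allocate a table instance with 'new_size' hash buckets and a fresh random
 * hash seed.
 */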
static struct table_instance *table_instance_alloc(int new_size)
{
	struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL);
	int i;

	if (!ti)
		return NULL;

	ti->buckets = kvmalloc_array(new_size, sizeof(struct hlist_head),
				     GFP_KERNEL);
	if (!ti->buckets) {
		kfree(ti);
		return NULL;
	}

	for (i = 0; i < new_size; i++)
		INIT_HLIST_HEAD(&ti->buckets[i]);

	ti->n_buckets = new_size;
	ti->node_ver = 0;
	ti->keep_flows = false;
	get_random_bytes(&ti->hash_seed, sizeof(u32));

	return ti;
}

static struct mask_array *tbl_mask_array_alloc(int size)
{
	struct mask_array *new;

	size = max(MASK_ARRAY_SIZE_MIN, size);
	new = kzalloc(sizeof(struct mask_array) +
		      sizeof(struct sw_flow_mask *) * size, GFP_KERNEL);
	if (!new)
		return NULL;

	new->count = 0;
	new->max = size;

	return new;
}

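/* Resize the mask array to 'size' slots, copying the still-populated mask
 * pointers into the new array; the old array is freed after an RCU grace
 * period.
 */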
static int tbl_mask_array_realloc(struct flow_table *tbl, int size)
{
	struct mask_array *old;
	struct mask_array *new;

	new = tbl_mask_array_alloc(size);
	if (!new)
		return -ENOMEM;

	old = ovsl_dereference(tbl->mask_array);
	if (old) {
		int i;

		for (i = 0; i < old->max; i++) {
			if (ovsl_dereference(old->masks[i]))
				new->masks[new->count++] = old->masks[i];
		}
	}

	rcu_assign_pointer(tbl->mask_array, new);
	kfree_rcu(old, rcu);

	return 0;
}

int ovs_flow_tbl_init(struct flow_table *table)
{
	struct table_instance *ti, *ufid_ti;
	struct mask_array *ma;

	table->mask_cache = __alloc_percpu(sizeof(struct mask_cache_entry) *
					   MC_HASH_ENTRIES,
					   __alignof__(struct mask_cache_entry));
	if (!table->mask_cache)
		return -ENOMEM;

	ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN);
	if (!ma)
		goto free_mask_cache;

	ti = table_instance_alloc(TBL_MIN_BUCKETS);
	if (!ti)
		goto free_mask_array;

	ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS);
	if (!ufid_ti)
		goto free_ti;

	rcu_assign_pointer(table->ti, ti);
	rcu_assign_pointer(table->ufid_ti, ufid_ti);
	rcu_assign_pointer(table->mask_array, ma);
	table->last_rehash = jiffies;
	table->count = 0;
	table->ufid_count = 0;
	return 0;

free_ti:
	__table_instance_destroy(ti);
free_mask_array:
	kfree(ma);
free_mask_cache:
	free_percpu(table->mask_cache);
	return -ENOMEM;
}

static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
{
	struct table_instance *ti = container_of(rcu, struct table_instance, rcu);

	__table_instance_destroy(ti);
}

static void table_instance_destroy(struct table_instance *ti,
				   struct table_instance *ufid_ti,
				   bool deferred)
{
	int i;

	if (!ti)
		return;

	BUG_ON(!ufid_ti);
	if (ti->keep_flows)
		goto skip_flows;

	for (i = 0; i < ti->n_buckets; i++) {
		struct sw_flow *flow;
		struct hlist_head *head = &ti->buckets[i];
		struct hlist_node *n;
		int ver = ti->node_ver;
		int ufid_ver = ufid_ti->node_ver;

		hlist_for_each_entry_safe(flow, n, head, flow_table.node[ver]) {
			hlist_del_rcu(&flow->flow_table.node[ver]);
			if (ovs_identifier_is_ufid(&flow->id))
				hlist_del_rcu(&flow->ufid_table.node[ufid_ver]);
			ovs_flow_free(flow, deferred);
		}
	}

skip_flows:
	if (deferred) {
		call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
		call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb);
	} else {
		__table_instance_destroy(ti);
		__table_instance_destroy(ufid_ti);
	}
}

/* No need for locking; this function is called from an RCU callback or the
 * error path.
 */
void ovs_flow_tbl_destroy(struct flow_table *table)
{
	struct table_instance *ti = rcu_dereference_raw(table->ti);
	struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti);

	free_percpu(table->mask_cache);
	kfree_rcu(rcu_dereference_raw(table->mask_array), rcu);
	table_instance_destroy(ti, ufid_ti, false);
}

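/* Walk the table instance for a flow dump; '*bucket' and '*last' record the
 * position reached so the next call can resume where this one left off.
 */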
struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
				       u32 *bucket, u32 *last)
{
	struct sw_flow *flow;
	struct hlist_head *head;
	int ver;
	int i;

	ver = ti->node_ver;
	while (*bucket < ti->n_buckets) {
		i = 0;
		head = &ti->buckets[*bucket];
		hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) {
			if (i < *last) {
				i++;
				continue;
			}
			*last = i + 1;
			return flow;
		}
		(*bucket)++;
		*last = 0;
	}

	return NULL;
}

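/* Map a flow hash to its bucket, mixing in the per-instance hash seed. */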
static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash)
{
	hash = jhash_1word(hash, ti->hash_seed);
	return &ti->buckets[hash & (ti->n_buckets - 1)];
}

static void table_instance_insert(struct table_instance *ti,
				  struct sw_flow *flow)
{
	struct hlist_head *head;

	head = find_bucket(ti, flow->flow_table.hash);
	hlist_add_head_rcu(&flow->flow_table.node[ti->node_ver], head);
}

static void ufid_table_instance_insert(struct table_instance *ti,
				       struct sw_flow *flow)
{
	struct hlist_head *head;

	head = find_bucket(ti, flow->ufid_table.hash);
	hlist_add_head_rcu(&flow->ufid_table.node[ti->node_ver], head);
}

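/* Re-link every flow of 'old' into 'new' using the other node version, so
 * readers of the old instance are not disturbed; 'ufid' selects which of the
 * two hash tables (key or UFID) is being copied.
 */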
static void flow_table_copy_flows(struct table_instance *old,
				  struct table_instance *new, bool ufid)
{
	int old_ver;
	int i;

	old_ver = old->node_ver;
	new->node_ver = !old_ver;

	/* Insert in new table. */
	for (i = 0; i < old->n_buckets; i++) {
		struct sw_flow *flow;
		struct hlist_head *head = &old->buckets[i];

		if (ufid)
			hlist_for_each_entry(flow, head,
					     ufid_table.node[old_ver])
				ufid_table_instance_insert(new, flow);
		else
			hlist_for_each_entry(flow, head,
					     flow_table.node[old_ver])
				table_instance_insert(new, flow);
	}

	old->keep_flows = true;
}

static struct table_instance *table_instance_rehash(struct table_instance *ti,
						    int n_buckets, bool ufid)
{
	struct table_instance *new_ti;

	new_ti = table_instance_alloc(n_buckets);
	if (!new_ti)
		return NULL;

	flow_table_copy_flows(ti, new_ti, ufid);

	return new_ti;
}

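/* Replace both table instances with empty ones and free all existing flows
 * after an RCU grace period.
 */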
int ovs_flow_tbl_flush(struct flow_table *flow_table)
{
	struct table_instance *old_ti, *new_ti;
	struct table_instance *old_ufid_ti, *new_ufid_ti;

	new_ti = table_instance_alloc(TBL_MIN_BUCKETS);
	if (!new_ti)
		return -ENOMEM;
	new_ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS);
	if (!new_ufid_ti)
		goto err_free_ti;

	old_ti = ovsl_dereference(flow_table->ti);
	old_ufid_ti = ovsl_dereference(flow_table->ufid_ti);

	rcu_assign_pointer(flow_table->ti, new_ti);
	rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti);
	flow_table->last_rehash = jiffies;
	flow_table->count = 0;
	flow_table->ufid_count = 0;

	table_instance_destroy(old_ti, old_ufid_ti, true);
	return 0;

err_free_ti:
	__table_instance_destroy(new_ti);
	return -ENOMEM;
}

static u32 flow_hash(const struct sw_flow_key *key,
		     const struct sw_flow_key_range *range)
{
	const u32 *hash_key = (const u32 *)((const u8 *)key + range->start);

	/* Make sure the number of hash bytes is a multiple of u32. */
	int hash_u32s = range_n_bytes(range) >> 2;

	return jhash2(hash_key, hash_u32s, 0);
}

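/* Unmasked key comparisons can skip the tunnel fields when no tunnel
 * protocol is set; this returns the offset at which the comparison starts.
 */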
static int flow_key_start(const struct sw_flow_key *key)
{
	if (key->tun_proto)
		return 0;
	else
		return rounddown(offsetof(struct sw_flow_key, phy),
					  sizeof(long));
}

static bool cmp_key(const struct sw_flow_key *key1,
		    const struct sw_flow_key *key2,
		    int key_start, int key_end)
{
	const long *cp1 = (const long *)((const u8 *)key1 + key_start);
	const long *cp2 = (const long *)((const u8 *)key2 + key_start);
	long diffs = 0;
	int i;

	for (i = key_start; i < key_end; i += sizeof(long))
		diffs |= *cp1++ ^ *cp2++;

	return diffs == 0;
}

static bool flow_cmp_masked_key(const struct sw_flow *flow,
				const struct sw_flow_key *key,
				const struct sw_flow_key_range *range)
{
	return cmp_key(&flow->key, key, range->start, range->end);
}

static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
				      const struct sw_flow_match *match)
{
	struct sw_flow_key *key = match->key;
	int key_start = flow_key_start(key);
	int key_end = match->range.end;

	BUG_ON(ovs_identifier_is_ufid(&flow->id));
	return cmp_key(flow->id.unmasked_key, key, key_start, key_end);
}

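/* Look up the flow matching 'unmasked' under a single 'mask': mask the key,
 * hash it and scan the resulting bucket. '*n_mask_hit' counts how many masks
 * were tried.
 */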
static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
					  const struct sw_flow_key *unmasked,
					  const struct sw_flow_mask *mask,
					  u32 *n_mask_hit)
{
	struct sw_flow *flow;
	struct hlist_head *head;
	u32 hash;
	struct sw_flow_key masked_key;

	ovs_flow_mask_key(&masked_key, unmasked, false, mask);
	hash = flow_hash(&masked_key, &mask->range);
	head = find_bucket(ti, hash);
	(*n_mask_hit)++;

	hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver]) {
		if (flow->mask == mask && flow->flow_table.hash == hash &&
		    flow_cmp_masked_key(flow, &masked_key, &mask->range))
			return flow;
	}
	return NULL;
}

/* Flow lookup does a full lookup on the flow table. It starts with the mask
 * at the index passed in via '*index'.
 */
static struct sw_flow *flow_lookup(struct flow_table *tbl,
				   struct table_instance *ti,
				   struct mask_array *ma,
				   const struct sw_flow_key *key,
				   u32 *n_mask_hit,
				   u32 *index)
{
	struct sw_flow *flow;
	struct sw_flow_mask *mask;
	int i;

	if (likely(*index < ma->max)) {
		mask = rcu_dereference_ovsl(ma->masks[*index]);
		if (mask) {
			flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
			if (flow)
				return flow;
		}
	}

	for (i = 0; i < ma->max; i++) {
		if (i == *index)
			continue;

		mask = rcu_dereference_ovsl(ma->masks[i]);
		if (unlikely(!mask))
			break;

		flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
		if (flow) { /* Found */
			*index = i;
			return flow;
		}
	}

	return NULL;
}

/* mask_cache maps a flow to a probable mask. The cache is not tightly
 * coupled to the mask list, so updates to the mask list can result in
 * inconsistent entries in the mask cache.
 * This is a per-CPU cache divided into MC_HASH_SEGS segments; in case of a
 * hash collision the entry is hashed into the next segment.
 */
struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
					  const struct sw_flow_key *key,
					  u32 skb_hash,
					  u32 *n_mask_hit)
{
	struct mask_array *ma = rcu_dereference(tbl->mask_array);
	struct table_instance *ti = rcu_dereference(tbl->ti);
	struct mask_cache_entry *entries, *ce;
	struct sw_flow *flow;
	u32 hash;
	int seg;

	*n_mask_hit = 0;
	if (unlikely(!skb_hash)) {
		u32 mask_index = 0;

		return flow_lookup(tbl, ti, ma, key, n_mask_hit, &mask_index);
	}

	/* Pre- and post-recirculation flows usually have the same skb_hash
	 * value. To avoid hash collisions, rehash the 'skb_hash' with
	 * 'recirc_id'.
	 */
	if (key->recirc_id)
		skb_hash = jhash_1word(skb_hash, key->recirc_id);

	ce = NULL;
	hash = skb_hash;
	entries = this_cpu_ptr(tbl->mask_cache);

	/* Find the cache entry 'ce' to operate on. */
	for (seg = 0; seg < MC_HASH_SEGS; seg++) {
		int index = hash & (MC_HASH_ENTRIES - 1);
		struct mask_cache_entry *e;

		e = &entries[index];
		if (e->skb_hash == skb_hash) {
			flow = flow_lookup(tbl, ti, ma, key, n_mask_hit,
					   &e->mask_index);
			if (!flow)
				e->skb_hash = 0;
			return flow;
		}

		if (!ce || e->skb_hash < ce->skb_hash)
			ce = e;  /* A better replacement cache candidate. */

		hash >>= MC_HASH_SHIFT;
	}

	/* Cache miss, do full lookup. */
	flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, &ce->mask_index);
	if (flow)
		ce->skb_hash = skb_hash;

	return flow;
}

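/* Plain lookup by key, without an skb hash; mask-hit statistics are
 * discarded.
 */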
struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
				    const struct sw_flow_key *key)
{
	struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
	struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
	u32 __always_unused n_mask_hit;
	u32 index = 0;

	return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &index);
}

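/* Return the flow whose unmasked key matches 'match' exactly, or NULL. */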
struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
					  const struct sw_flow_match *match)
{
	struct mask_array *ma = ovsl_dereference(tbl->mask_array);
	int i;

	/* Always called under ovs-mutex. */
	for (i = 0; i < ma->max; i++) {
		struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
		u32 __always_unused n_mask_hit;
		struct sw_flow_mask *mask;
		struct sw_flow *flow;

		mask = ovsl_dereference(ma->masks[i]);
		if (!mask)
			continue;

		flow = masked_flow_lookup(ti, match->key, mask, &n_mask_hit);
		if (flow && ovs_identifier_is_key(&flow->id) &&
		    ovs_flow_cmp_unmasked_key(flow, match)) {
			return flow;
		}
	}

	return NULL;
}

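/* Flows may also be indexed by a unique flow identifier (UFID); the helpers
 * below hash and compare UFIDs and look flows up in the UFID table instance.
 */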
static u32 ufid_hash(const struct sw_flow_id *sfid)
{
	return jhash(sfid->ufid, sfid->ufid_len, 0);
}

static bool ovs_flow_cmp_ufid(const struct sw_flow *flow,
			      const struct sw_flow_id *sfid)
{
	if (flow->id.ufid_len != sfid->ufid_len)
		return false;

	return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len);
}

bool ovs_flow_cmp(const struct sw_flow *flow, const struct sw_flow_match *match)
{
	if (ovs_identifier_is_ufid(&flow->id))
		return flow_cmp_masked_key(flow, match->key, &match->range);

	return ovs_flow_cmp_unmasked_key(flow, match);
}

struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl,
					 const struct sw_flow_id *ufid)
{
	struct table_instance *ti = rcu_dereference_ovsl(tbl->ufid_ti);
	struct sw_flow *flow;
	struct hlist_head *head;
	u32 hash;

	hash = ufid_hash(ufid);
	head = find_bucket(ti, hash);
	hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver]) {
		if (flow->ufid_table.hash == hash &&
		    ovs_flow_cmp_ufid(flow, ufid))
			return flow;
	}
	return NULL;
}

int ovs_flow_tbl_num_masks(const struct flow_table *table)
{
	struct mask_array *ma = rcu_dereference_ovsl(table->mask_array);

	return READ_ONCE(ma->count);
}

static struct table_instance *table_instance_expand(struct table_instance *ti,
						    bool ufid)
{
	return table_instance_rehash(ti, ti->n_buckets * 2, ufid);
}

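/* Remove 'mask' from the mask array by moving the last in-use pointer into
 * its slot, free it after an RCU grace period, and shrink the array once it
 * has become sparse.
 */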
static void tbl_mask_array_del_mask(struct flow_table *tbl,
				    struct sw_flow_mask *mask)
{
	struct mask_array *ma = ovsl_dereference(tbl->mask_array);
	int i, ma_count = READ_ONCE(ma->count);

	/* Remove the deleted mask pointers from the array */
	for (i = 0; i < ma_count; i++) {
		if (mask == ovsl_dereference(ma->masks[i]))
			goto found;
	}

	BUG();
	return;

found:
	WRITE_ONCE(ma->count, ma_count - 1);

	rcu_assign_pointer(ma->masks[i], ma->masks[ma_count - 1]);
	RCU_INIT_POINTER(ma->masks[ma_count - 1], NULL);

	kfree_rcu(mask, rcu);

	/* Shrink the mask array if necessary. */
	if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) &&
	    ma_count <= (ma->max / 3))
		tbl_mask_array_realloc(tbl, ma->max / 2);
}

/* Remove 'mask' from the mask list, if it is not needed any more. */
static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask)
{
	if (mask) {
		/* ovs-lock is required to protect mask-refcount and
		 * mask list.
		 */
		ASSERT_OVSL();
		BUG_ON(!mask->ref_count);
		mask->ref_count--;

		if (!mask->ref_count)
			tbl_mask_array_del_mask(tbl, mask);
	}
}

/* Must be called with OVS mutex held. */
void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
{
	struct table_instance *ti = ovsl_dereference(table->ti);
	struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti);

	BUG_ON(table->count == 0);
	hlist_del_rcu(&flow->flow_table.node[ti->node_ver]);
	table->count--;
	if (ovs_identifier_is_ufid(&flow->id)) {
		hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]);
		table->ufid_count--;
	}

	/* RCU delete the mask. 'flow->mask' is not NULLed, as it should be
	 * accessible as long as the RCU read lock is held.
	 */
	flow_mask_remove(table, flow->mask);
}

static struct sw_flow_mask *mask_alloc(void)
{
	struct sw_flow_mask *mask;

	mask = kmalloc(sizeof(*mask), GFP_KERNEL);
	if (mask)
		mask->ref_count = 1;

	return mask;
}

static bool mask_equal(const struct sw_flow_mask *a,
		       const struct sw_flow_mask *b)
{
	const u8 *a_ = (const u8 *)&a->key + a->range.start;
	const u8 *b_ = (const u8 *)&b->key + b->range.start;

	return  (a->range.end == b->range.end)
		&& (a->range.start == b->range.start)
		&& (memcmp(a_, b_, range_n_bytes(&a->range)) == 0);
}

static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl,
					   const struct sw_flow_mask *mask)
{
	struct mask_array *ma;
	int i;

	ma = ovsl_dereference(tbl->mask_array);
	for (i = 0; i < ma->max; i++) {
		struct sw_flow_mask *t;

		t = ovsl_dereference(ma->masks[i]);
		if (t && mask_equal(mask, t))
			return t;
	}

	return NULL;
}

static int tbl_mask_array_add_mask(struct flow_table *tbl,
				   struct sw_flow_mask *new)
{
	struct mask_array *ma = ovsl_dereference(tbl->mask_array);
	int err, ma_count = READ_ONCE(ma->count);

	if (ma_count >= ma->max) {
		err = tbl_mask_array_realloc(tbl, ma->max +
					     MASK_ARRAY_SIZE_MIN);
		if (err)
			return err;

		ma = ovsl_dereference(tbl->mask_array);
	}

	BUG_ON(ovsl_dereference(ma->masks[ma_count]));

	rcu_assign_pointer(ma->masks[ma_count], new);
	WRITE_ONCE(ma->count, ma_count + 1);

	return 0;
}

/* Add 'mask' into the mask list, if it is not already there. */
static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow,
			    const struct sw_flow_mask *new)
{
	struct sw_flow_mask *mask;

	mask = flow_mask_find(tbl, new);
	if (!mask) {
		/* Allocate a new mask if none exists. */
		mask = mask_alloc();
		if (!mask)
			return -ENOMEM;
		mask->key = new->key;
		mask->range = new->range;

		/* Add mask to mask-list. */
		if (tbl_mask_array_add_mask(tbl, mask)) {
			kfree(mask);
			return -ENOMEM;
		}
	} else {
		BUG_ON(!mask->ref_count);
		mask->ref_count++;
	}

	flow->mask = mask;
	return 0;
}

/* Must be called with OVS mutex held. */
static void flow_key_insert(struct flow_table *table, struct sw_flow *flow)
{
	struct table_instance *new_ti = NULL;
	struct table_instance *ti;

	flow->flow_table.hash = flow_hash(&flow->key, &flow->mask->range);
	ti = ovsl_dereference(table->ti);
	table_instance_insert(ti, flow);
	table->count++;

	/* Expand table, if necessary, to make room. */
	if (table->count > ti->n_buckets)
		new_ti = table_instance_expand(ti, false);
	else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL))
		new_ti = table_instance_rehash(ti, ti->n_buckets, false);

	if (new_ti) {
		rcu_assign_pointer(table->ti, new_ti);
		call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
		table->last_rehash = jiffies;
	}
}

/* Must be called with OVS mutex held. */
static void flow_ufid_insert(struct flow_table *table, struct sw_flow *flow)
{
	struct table_instance *ti;

	flow->ufid_table.hash = ufid_hash(&flow->id);
	ti = ovsl_dereference(table->ufid_ti);
	ufid_table_instance_insert(ti, flow);
	table->ufid_count++;

	/* Expand table, if necessary, to make room. */
	if (table->ufid_count > ti->n_buckets) {
		struct table_instance *new_ti;

		new_ti = table_instance_expand(ti, true);
		if (new_ti) {
			rcu_assign_pointer(table->ufid_ti, new_ti);
			call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
		}
	}
}

/* Must be called with OVS mutex held. */
int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
			const struct sw_flow_mask *mask)
{
	int err;

	err = flow_mask_insert(table, flow, mask);
	if (err)
		return err;
	flow_key_insert(table, flow);
	if (ovs_identifier_is_ufid(&flow->id))
		flow_ufid_insert(table, flow);

	return 0;
}

/* Initializes the flow module.
 * Returns zero if successful or a negative error code. */
int ovs_flow_init(void)
{
	BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
	BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));

	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow)
				       + (nr_cpu_ids
					  * sizeof(struct sw_flow_stats *)),
				       0, 0, NULL);
	if (flow_cache == NULL)
		return -ENOMEM;

	flow_stats_cache
		= kmem_cache_create("sw_flow_stats", sizeof(struct sw_flow_stats),
				    0, SLAB_HWCACHE_ALIGN, NULL);
	if (flow_stats_cache == NULL) {
		kmem_cache_destroy(flow_cache);
		flow_cache = NULL;
		return -ENOMEM;
	}

	return 0;
}

/* Uninitializes the flow module. */
void ovs_flow_exit(void)
{
	kmem_cache_destroy(flow_stats_cache);
	kmem_cache_destroy(flow_cache);
}