/*
 * Copyright (c) 2007-2013 Nicira, Inc.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
 * 02110-1301, USA
 */

#include "flow.h"
#include "datapath.h"
#include <linux/uaccess.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <net/llc_pdu.h>
#include <linux/kernel.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/llc.h>
#include <linux/module.h>
#include <linux/in.h>
#include <linux/rcupdate.h>
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/sctp.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/icmp.h>
#include <linux/icmpv6.h>
#include <linux/rculist.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ndisc.h>

#define TBL_MIN_BUCKETS		1024
#define REHASH_INTERVAL		(10 * 60 * HZ)

static struct kmem_cache *flow_cache;

static u16 range_n_bytes(const struct sw_flow_key_range *range)
{
	return range->end - range->start;
}

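/* Copy 'src' into 'dst', applying 'mask': each long-sized word in the
 * masked range is ANDed with the corresponding word of the mask.
 */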
void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src,
		       const struct sw_flow_mask *mask)
{
	const long *m = (long *)((u8 *)&mask->key + mask->range.start);
	const long *s = (long *)((u8 *)src + mask->range.start);
	long *d = (long *)((u8 *)dst + mask->range.start);
	int i;

	/* The memory outside of 'mask->range' is not set since
	 * further operations on 'dst' only use contents within
	 * 'mask->range'.
	 */
	for (i = 0; i < range_n_bytes(&mask->range); i += sizeof(long))
		*d++ = *s++ & *m++;
}

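/* Allocate a new flow.  Depending on 'percpu_stats', statistics are kept
 * either in a single preallocated struct flow_stats or per CPU, each
 * protected by its own spinlock.
 */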
struct sw_flow *ovs_flow_alloc(bool percpu_stats)
{
	struct sw_flow *flow;
	int cpu;

	flow = kmem_cache_alloc(flow_cache, GFP_KERNEL);
	if (!flow)
		return ERR_PTR(-ENOMEM);

	flow->sf_acts = NULL;
	flow->mask = NULL;

	flow->stats.is_percpu = percpu_stats;

	if (!percpu_stats) {
		flow->stats.stat = kzalloc(sizeof(*flow->stats.stat), GFP_KERNEL);
		if (!flow->stats.stat)
			goto err;

		spin_lock_init(&flow->stats.stat->lock);
	} else {
		flow->stats.cpu_stats = alloc_percpu(struct flow_stats);
		if (!flow->stats.cpu_stats)
			goto err;

		for_each_possible_cpu(cpu) {
			struct flow_stats *cpu_stats;

			cpu_stats = per_cpu_ptr(flow->stats.cpu_stats, cpu);
			spin_lock_init(&cpu_stats->lock);
		}
	}
	return flow;
err:
	kfree(flow);
	return ERR_PTR(-ENOMEM);
}

int ovs_flow_tbl_count(struct flow_table *table)
{
	return table->count;
}

static struct flex_array *alloc_buckets(unsigned int n_buckets)
{
	struct flex_array *buckets;
	int i, err;

	buckets = flex_array_alloc(sizeof(struct hlist_head),
				   n_buckets, GFP_KERNEL);
	if (!buckets)
		return NULL;

	err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL);
	if (err) {
		flex_array_free(buckets);
		return NULL;
	}

	for (i = 0; i < n_buckets; i++)
		INIT_HLIST_HEAD((struct hlist_head *)
					flex_array_get(buckets, i));

	return buckets;
}

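/* Immediately free a flow along with its actions and statistics;
 * ovs_flow_free() below is the RCU-aware wrapper.
 */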
static void flow_free(struct sw_flow *flow)
{
	kfree((struct sf_flow_acts __force *)flow->sf_acts);
	if (flow->stats.is_percpu)
		free_percpu(flow->stats.cpu_stats);
	else
		kfree(flow->stats.stat);
	kmem_cache_free(flow_cache, flow);
}

static void rcu_free_flow_callback(struct rcu_head *rcu)
{
	struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu);

	flow_free(flow);
}

static void flow_mask_del_ref(struct sw_flow_mask *mask, bool deferred)
{
	if (!mask)
		return;

	BUG_ON(!mask->ref_count);
	mask->ref_count--;

	if (!mask->ref_count) {
		list_del_rcu(&mask->list);
		if (deferred)
			kfree_rcu(mask, rcu);
		else
			kfree(mask);
	}
}

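/* Drop the flow's mask reference and free the flow, either right away or
 * after an RCU grace period when 'deferred' is true.
 */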
void ovs_flow_free(struct sw_flow *flow, bool deferred)
{
	if (!flow)
		return;

	flow_mask_del_ref(flow->mask, deferred);

	if (deferred)
		call_rcu(&flow->rcu, rcu_free_flow_callback);
	else
		flow_free(flow);
}

static void free_buckets(struct flex_array *buckets)
{
	flex_array_free(buckets);
}

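/* Free a table instance and, unless 'keep_flows' is set (as it is on the
 * old instance after a rehash), every flow still linked into it.
 */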
static void __table_instance_destroy(struct table_instance *ti)
{
	int i;

	if (ti->keep_flows)
		goto skip_flows;

	for (i = 0; i < ti->n_buckets; i++) {
		struct sw_flow *flow;
		struct hlist_head *head = flex_array_get(ti->buckets, i);
		struct hlist_node *n;
		int ver = ti->node_ver;

		hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) {
			hlist_del(&flow->hash_node[ver]);
			ovs_flow_free(flow, false);
		}
	}

skip_flows:
	free_buckets(ti->buckets);
	kfree(ti);
}

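/* Allocate a table instance with 'new_size' buckets and a fresh random
 * hash seed.
 */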
static struct table_instance *table_instance_alloc(int new_size)
{
	struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL);

	if (!ti)
		return NULL;

	ti->buckets = alloc_buckets(new_size);

	if (!ti->buckets) {
		kfree(ti);
		return NULL;
	}
	ti->n_buckets = new_size;
	ti->node_ver = 0;
	ti->keep_flows = false;
	get_random_bytes(&ti->hash_seed, sizeof(u32));

	return ti;
}

int ovs_flow_tbl_init(struct flow_table *table)
{
	struct table_instance *ti;

	ti = table_instance_alloc(TBL_MIN_BUCKETS);

	if (!ti)
		return -ENOMEM;

	rcu_assign_pointer(table->ti, ti);
	INIT_LIST_HEAD(&table->mask_list);
	table->last_rehash = jiffies;
	table->count = 0;
	return 0;
}

static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu)
{
	struct table_instance *ti = container_of(rcu, struct table_instance, rcu);

	__table_instance_destroy(ti);
}

static void table_instance_destroy(struct table_instance *ti, bool deferred)
{
	if (!ti)
		return;

	if (deferred)
		call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb);
	else
		__table_instance_destroy(ti);
}

void ovs_flow_tbl_destroy(struct flow_table *table)
{
	struct table_instance *ti = ovsl_dereference(table->ti);

	table_instance_destroy(ti, false);
}

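/* Iterate over the table for flow dumps: '*bucket' and '*last' form a
 * cursor advanced on each call; NULL is returned once all buckets have
 * been visited.
 */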
struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti,
				       u32 *bucket, u32 *last)
{
	struct sw_flow *flow;
	struct hlist_head *head;
	int ver;
	int i;

	ver = ti->node_ver;
	while (*bucket < ti->n_buckets) {
		i = 0;
		head = flex_array_get(ti->buckets, *bucket);
		hlist_for_each_entry_rcu(flow, head, hash_node[ver]) {
			if (i < *last) {
				i++;
				continue;
			}
			*last = i + 1;
			return flow;
		}
		(*bucket)++;
		*last = 0;
	}

	return NULL;
}

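/* Map a flow hash to its bucket: mix the hash with the instance's seed
 * and mask it with the power-of-two bucket count.
 */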
static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash)
{
	hash = jhash_1word(hash, ti->hash_seed);
	return flex_array_get(ti->buckets,
				(hash & (ti->n_buckets - 1)));
}

static void table_instance_insert(struct table_instance *ti, struct sw_flow *flow)
{
	struct hlist_head *head;

	head = find_bucket(ti, flow->hash);
	hlist_add_head_rcu(&flow->hash_node[ti->node_ver], head);
}

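/* Re-link every flow from 'old' into 'new' using the other hash_node
 * slot, so flows stay reachable from both instances during the
 * switch-over.
 */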
static void flow_table_copy_flows(struct table_instance *old,
				  struct table_instance *new)
{
	int old_ver;
	int i;

	old_ver = old->node_ver;
	new->node_ver = !old_ver;

	/* Insert in new table. */
	for (i = 0; i < old->n_buckets; i++) {
		struct sw_flow *flow;
		struct hlist_head *head;

		head = flex_array_get(old->buckets, i);

		hlist_for_each_entry(flow, head, hash_node[old_ver])
			table_instance_insert(new, flow);
	}

	old->keep_flows = true;
}

static struct table_instance *table_instance_rehash(struct table_instance *ti,
					    int n_buckets)
{
	struct table_instance *new_ti;

	new_ti = table_instance_alloc(n_buckets);
	if (!new_ti)
		return NULL;

	flow_table_copy_flows(ti, new_ti);

	return new_ti;
}

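/* Replace the current table instance with an empty one; the old instance
 * and its flows are freed after an RCU grace period.
 */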
int ovs_flow_tbl_flush(struct flow_table *flow_table)
{
	struct table_instance *old_ti;
	struct table_instance *new_ti;

	old_ti = ovsl_dereference(flow_table->ti);
	new_ti = table_instance_alloc(TBL_MIN_BUCKETS);
	if (!new_ti)
		return -ENOMEM;

	rcu_assign_pointer(flow_table->ti, new_ti);
	flow_table->last_rehash = jiffies;
	flow_table->count = 0;

	table_instance_destroy(old_ti, true);
	return 0;
}

static u32 flow_hash(const struct sw_flow_key *key, int key_start,
		     int key_end)
{
	u32 *hash_key = (u32 *)((u8 *)key + key_start);
	int hash_u32s = (key_end - key_start) >> 2;

	/* Make sure the number of hash bytes is a multiple of u32. */
	BUILD_BUG_ON(sizeof(long) % sizeof(u32));

	return jhash2(hash_key, hash_u32s, 0);
}

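/* Flows with a tunnel key are hashed and compared starting at offset 0;
 * otherwise the tunnel part of the key is skipped.
 */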
static int flow_key_start(const struct sw_flow_key *key)
{
	if (key->tun_key.ipv4_dst)
		return 0;
	else
		return rounddown(offsetof(struct sw_flow_key, phy),
					  sizeof(long));
}

static bool cmp_key(const struct sw_flow_key *key1,
		    const struct sw_flow_key *key2,
		    int key_start, int key_end)
{
	const long *cp1 = (long *)((u8 *)key1 + key_start);
	const long *cp2 = (long *)((u8 *)key2 + key_start);
	long diffs = 0;
	int i;

	for (i = key_start; i < key_end;  i += sizeof(long))
		diffs |= *cp1++ ^ *cp2++;

	return diffs == 0;
}

static bool flow_cmp_masked_key(const struct sw_flow *flow,
				const struct sw_flow_key *key,
				int key_start, int key_end)
{
	return cmp_key(&flow->key, key, key_start, key_end);
}

bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow,
			       struct sw_flow_match *match)
{
	struct sw_flow_key *key = match->key;
	int key_start = flow_key_start(key);
	int key_end = match->range.end;

	return cmp_key(&flow->unmasked_key, key, key_start, key_end);
}

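/* Look up 'unmasked' under a single 'mask': mask the key, hash the
 * masked range and compare the candidates found in that bucket.
 */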
static struct sw_flow *masked_flow_lookup(struct table_instance *ti,
					  const struct sw_flow_key *unmasked,
					  struct sw_flow_mask *mask)
{
	struct sw_flow *flow;
	struct hlist_head *head;
	int key_start = mask->range.start;
	int key_end = mask->range.end;
	u32 hash;
	struct sw_flow_key masked_key;

	ovs_flow_mask_key(&masked_key, unmasked, mask);
	hash = flow_hash(&masked_key, key_start, key_end);
	head = find_bucket(ti, hash);
	hlist_for_each_entry_rcu(flow, head, hash_node[ti->node_ver]) {
		if (flow->mask == mask && flow->hash == hash &&
		    flow_cmp_masked_key(flow, &masked_key,
					  key_start, key_end))
			return flow;
	}
	return NULL;
}

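/* Look up a flow by trying each mask in the mask list in turn;
 * '*n_mask_hit' reports how many masks were tried.
 */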
struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl,
				    const struct sw_flow_key *key,
				    u32 *n_mask_hit)
{
	struct table_instance *ti = rcu_dereference_ovsl(tbl->ti);
	struct sw_flow_mask *mask;
	struct sw_flow *flow;

	*n_mask_hit = 0;
	list_for_each_entry_rcu(mask, &tbl->mask_list, list) {
		(*n_mask_hit)++;
		flow = masked_flow_lookup(ti, key, mask);
		if (flow)  /* Found */
			return flow;
	}
	return NULL;
}

struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
				    const struct sw_flow_key *key)
{
	u32 __always_unused n_mask_hit;

	return ovs_flow_tbl_lookup_stats(tbl, key, &n_mask_hit);
}

int ovs_flow_tbl_num_masks(const struct flow_table *table)
{
	struct sw_flow_mask *mask;
	int num = 0;

	list_for_each_entry(mask, &table->mask_list, list)
		num++;

	return num;
}

static struct table_instance *table_instance_expand(struct table_instance *ti)
{
	return table_instance_rehash(ti, ti->n_buckets * 2);
}

void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
{
	struct table_instance *ti = ovsl_dereference(table->ti);

	BUG_ON(table->count == 0);
	hlist_del_rcu(&flow->hash_node[ti->node_ver]);
	table->count--;
}

static struct sw_flow_mask *mask_alloc(void)
{
	struct sw_flow_mask *mask;

	mask = kmalloc(sizeof(*mask), GFP_KERNEL);
	if (mask)
		mask->ref_count = 0;

	return mask;
}

static void mask_add_ref(struct sw_flow_mask *mask)
{
	mask->ref_count++;
}

static bool mask_equal(const struct sw_flow_mask *a,
		       const struct sw_flow_mask *b)
{
	u8 *a_ = (u8 *)&a->key + a->range.start;
	u8 *b_ = (u8 *)&b->key + b->range.start;

	return  (a->range.end == b->range.end)
		&& (a->range.start == b->range.start)
		&& (memcmp(a_, b_, range_n_bytes(&a->range)) == 0);
}

static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl,
					   const struct sw_flow_mask *mask)
{
	struct list_head *ml;

	list_for_each(ml, &tbl->mask_list) {
		struct sw_flow_mask *m;
		m = container_of(ml, struct sw_flow_mask, list);
		if (mask_equal(mask, m))
			return m;
	}

	return NULL;
}

/* Add 'mask' into the mask list, if it is not already there. */
static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow,
			    struct sw_flow_mask *new)
{
	struct sw_flow_mask *mask;
	mask = flow_mask_find(tbl, new);
	if (!mask) {
		/* Allocate a new mask if none exists. */
		mask = mask_alloc();
		if (!mask)
			return -ENOMEM;
		mask->key = new->key;
		mask->range = new->range;
		list_add_rcu(&mask->list, &tbl->mask_list);
	}

	mask_add_ref(mask);
	flow->mask = mask;
	return 0;
}

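/* Insert 'flow', masked by 'mask', into the table.  The table is expanded
 * when the flow count exceeds the bucket count, and rehashed with a new
 * seed after REHASH_INTERVAL.
 */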
int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
			struct sw_flow_mask *mask)
{
	struct table_instance *new_ti = NULL;
	struct table_instance *ti;
	int err;

	err = flow_mask_insert(table, flow, mask);
	if (err)
		return err;

	flow->hash = flow_hash(&flow->key, flow->mask->range.start,
			flow->mask->range.end);
	ti = ovsl_dereference(table->ti);
	table_instance_insert(ti, flow);
	table->count++;

	/* Expand table, if necessary, to make room. */
	if (table->count > ti->n_buckets)
		new_ti = table_instance_expand(ti);
	else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL))
		new_ti = table_instance_rehash(ti, ti->n_buckets);

	if (new_ti) {
		rcu_assign_pointer(table->ti, new_ti);
		table_instance_destroy(ti, true);
		table->last_rehash = jiffies;
	}
	return 0;
}

/* Initializes the flow module.
 * Returns zero if successful or a negative error code. */
int ovs_flow_init(void)
{
	BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long));
	BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long));

	flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0,
					0, NULL);
	if (flow_cache == NULL)
		return -ENOMEM;

	return 0;
}

/* Uninitializes the flow module. */
void ovs_flow_exit(void)
{
	kmem_cache_destroy(flow_cache);
}