/*
 * Resizable, Scalable, Concurrent Hash Table
 *
 * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
 * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
 *
 * Code partially derived from nft_hash
 * Rewritten with rehash code from br_multicast plus single list
 * pointer as suggested by Josh Triplett
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/log2.h>
#include <linux/sched.h>
#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <linux/rhashtable.h>
#include <linux/err.h>
#include <linux/export.h>

#define HASH_DEFAULT_SIZE	64UL
#define HASH_MIN_SIZE		4U
#define BUCKET_LOCKS_PER_CPU	32UL

union nested_table {
	union nested_table __rcu *table;
	struct rhash_head __rcu *bucket;
};

static u32 head_hashfn(struct rhashtable *ht,
		       const struct bucket_table *tbl,
		       const struct rhash_head *he)
{
	return rht_head_hashfn(ht, tbl, he, ht->p);
}

#ifdef CONFIG_PROVE_LOCKING
#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))

int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
	return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
}
EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);

int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
{
	spinlock_t *lock = rht_bucket_lock(tbl, hash);

	return (debug_locks) ? lockdep_is_held(lock) : 1;
}
EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
#else
#define ASSERT_RHT_MUTEX(HT)
#endif

static void nested_table_free(union nested_table *ntbl, unsigned int size)
{
	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
	const unsigned int len = 1 << shift;
	unsigned int i;

	ntbl = rcu_dereference_raw(ntbl->table);
	if (!ntbl)
		return;

	if (size > len) {
		size >>= shift;
		for (i = 0; i < len; i++)
			nested_table_free(ntbl + i, size);
	}

	kfree(ntbl);
}

static void nested_bucket_table_free(const struct bucket_table *tbl)
{
	unsigned int size = tbl->size >> tbl->nest;
	unsigned int len = 1 << tbl->nest;
	union nested_table *ntbl;
	unsigned int i;

	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);

	for (i = 0; i < len; i++)
		nested_table_free(ntbl + i, size);

	kfree(ntbl);
}

static void bucket_table_free(const struct bucket_table *tbl)
{
	if (tbl->nest)
		nested_bucket_table_free(tbl);

	free_bucket_spinlocks(tbl->locks);
	kvfree(tbl);
}

static void bucket_table_free_rcu(struct rcu_head *head)
{
	bucket_table_free(container_of(head, struct bucket_table, rcu));
}

static union nested_table *nested_table_alloc(struct rhashtable *ht,
					      union nested_table __rcu **prev,
					      bool leaf)
{
	union nested_table *ntbl;
	int i;

	ntbl = rcu_dereference(*prev);
	if (ntbl)
		return ntbl;

	ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);

	if (ntbl && leaf) {
		for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++)
			INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
	}

	rcu_assign_pointer(*prev, ntbl);

	return ntbl;
}

static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
						      size_t nbuckets,
						      gfp_t gfp)
{
	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
	struct bucket_table *tbl;
	size_t size;

	if (nbuckets < (1 << (shift + 1)))
		return NULL;

	size = sizeof(*tbl) + sizeof(tbl->buckets[0]);

	tbl = kzalloc(size, gfp);
	if (!tbl)
		return NULL;

	if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
				false)) {
		kfree(tbl);
		return NULL;
	}

	tbl->nest = (ilog2(nbuckets) - 1) % shift + 1;

	return tbl;
}

static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
					       size_t nbuckets,
					       gfp_t gfp)
{
	struct bucket_table *tbl = NULL;
	size_t size, max_locks;
	int i;

	size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
	tbl = kvzalloc(size, gfp);

	size = nbuckets;

	if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) {
		tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
		nbuckets = 0;
	}

	if (tbl == NULL)
		return NULL;

	tbl->size = size;

	max_locks = size >> 1;
	if (tbl->nest)
		max_locks = min_t(size_t, max_locks, 1U << tbl->nest);

	if (alloc_bucket_spinlocks(&tbl->locks, &tbl->locks_mask, max_locks,
				   ht->p.locks_mul, gfp) < 0) {
		bucket_table_free(tbl);
		return NULL;
	}

	rcu_head_init(&tbl->rcu);
	INIT_LIST_HEAD(&tbl->walkers);

	tbl->hash_rnd = get_random_u32();

	for (i = 0; i < nbuckets; i++)
		INIT_RHT_NULLS_HEAD(tbl->buckets[i]);

	return tbl;
}

static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
						  struct bucket_table *tbl)
{
	struct bucket_table *new_tbl;

	do {
		new_tbl = tbl;
		tbl = rht_dereference_rcu(tbl->future_tbl, ht);
	} while (tbl);

	return new_tbl;
}

static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
{
	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
	struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
	struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash);
	int err = -EAGAIN;
	struct rhash_head *head, *next, *entry;
	spinlock_t *new_bucket_lock;
	unsigned int new_hash;

	if (new_tbl->nest)
		goto out;

	err = -ENOENT;

	rht_for_each(entry, old_tbl, old_hash) {
		err = 0;
		next = rht_dereference_bucket(entry->next, old_tbl, old_hash);

		if (rht_is_a_nulls(next))
			break;

		pprev = &entry->next;
	}

	if (err)
		goto out;

	new_hash = head_hashfn(ht, new_tbl, entry);

	new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);

	spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
	head = rht_dereference_bucket(new_tbl->buckets[new_hash],
				      new_tbl, new_hash);

	RCU_INIT_POINTER(entry->next, head);

	rcu_assign_pointer(new_tbl->buckets[new_hash], entry);
	spin_unlock(new_bucket_lock);

	rcu_assign_pointer(*pprev, next);

out:
	return err;
}

static int rhashtable_rehash_chain(struct rhashtable *ht,
				    unsigned int old_hash)
{
	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
	spinlock_t *old_bucket_lock;
	int err;

	old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);

	spin_lock_bh(old_bucket_lock);
	while (!(err = rhashtable_rehash_one(ht, old_hash)))
		;

	if (err == -ENOENT)
		err = 0;

	spin_unlock_bh(old_bucket_lock);

	return err;
}

static int rhashtable_rehash_attach(struct rhashtable *ht,
				    struct bucket_table *old_tbl,
				    struct bucket_table *new_tbl)
{
	/* Make insertions go into the new, empty table right away. Deletions
	 * and lookups will be attempted in both tables until we synchronize.
	 * As cmpxchg() provides strong barriers, we do not need
	 * rcu_assign_pointer().
	 */

	if (cmpxchg(&old_tbl->future_tbl, NULL, new_tbl) != NULL)
		return -EEXIST;

	return 0;
}

static int rhashtable_rehash_table(struct rhashtable *ht)
{
	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
	struct bucket_table *new_tbl;
	struct rhashtable_walker *walker;
	unsigned int old_hash;
	int err;

	new_tbl = rht_dereference(old_tbl->future_tbl, ht);
	if (!new_tbl)
		return 0;

	for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
		err = rhashtable_rehash_chain(ht, old_hash);
		if (err)
			return err;
		cond_resched();
	}

	/* Publish the new table pointer. */
	rcu_assign_pointer(ht->tbl, new_tbl);

	spin_lock(&ht->lock);
	list_for_each_entry(walker, &old_tbl->walkers, list)
		walker->tbl = NULL;

	/* Wait for readers. All new readers will see the new
	 * table, and thus no references to the old table will
	 * remain.
	 * We do this inside the locked region so that
	 * rhashtable_walk_stop() can use rcu_head_after_call_rcu()
	 * to check if it should not re-link the table.
	 */
	call_rcu(&old_tbl->rcu, bucket_table_free_rcu);
	spin_unlock(&ht->lock);

	return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
}

static int rhashtable_rehash_alloc(struct rhashtable *ht,
				   struct bucket_table *old_tbl,
				   unsigned int size)
{
	struct bucket_table *new_tbl;
	int err;

	ASSERT_RHT_MUTEX(ht);

	new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
	if (new_tbl == NULL)
		return -ENOMEM;

	err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
	if (err)
		bucket_table_free(new_tbl);

	return err;
}

/**
 * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
 * @ht:		the hash table to shrink
 *
 * This function shrinks the hash table to fit, i.e., the smallest
 * size would not cause it to expand right away automatically.
 *
 * The caller must ensure that no concurrent resizing occurs by holding
 * ht->mutex.
 *
 * The caller must ensure that no concurrent table mutations take place.
 * It is however valid to have concurrent lookups if they are RCU protected.
 *
 * It is valid to have concurrent insertions and deletions protected by per
 * bucket locks or concurrent RCU protected lookups and traversals.
 */
static int rhashtable_shrink(struct rhashtable *ht)
{
	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
	unsigned int nelems = atomic_read(&ht->nelems);
	unsigned int size = 0;

	if (nelems)
		size = roundup_pow_of_two(nelems * 3 / 2);
	if (size < ht->p.min_size)
		size = ht->p.min_size;

	if (old_tbl->size <= size)
		return 0;

	if (rht_dereference(old_tbl->future_tbl, ht))
		return -EEXIST;

	return rhashtable_rehash_alloc(ht, old_tbl, size);
}

static void rht_deferred_worker(struct work_struct *work)
{
	struct rhashtable *ht;
	struct bucket_table *tbl;
	int err = 0;

	ht = container_of(work, struct rhashtable, run_work);
	mutex_lock(&ht->mutex);

	tbl = rht_dereference(ht->tbl, ht);
	tbl = rhashtable_last_table(ht, tbl);

	if (rht_grow_above_75(ht, tbl))
		err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2);
	else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
		err = rhashtable_shrink(ht);
	else if (tbl->nest)
		err = rhashtable_rehash_alloc(ht, tbl, tbl->size);

	if (!err || err == -EEXIST) {
		int nerr;

		nerr = rhashtable_rehash_table(ht);
		err = err ?: nerr;
	}

	mutex_unlock(&ht->mutex);

	if (err)
		schedule_work(&ht->run_work);
}

static int rhashtable_insert_rehash(struct rhashtable *ht,
				    struct bucket_table *tbl)
{
	struct bucket_table *old_tbl;
	struct bucket_table *new_tbl;
	unsigned int size;
	int err;

	old_tbl = rht_dereference_rcu(ht->tbl, ht);

	size = tbl->size;

	err = -EBUSY;

	if (rht_grow_above_75(ht, tbl))
		size *= 2;
	/* Do not schedule more than one rehash */
	else if (old_tbl != tbl)
		goto fail;

	err = -ENOMEM;

	new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN);
	if (new_tbl == NULL)
		goto fail;

	err = rhashtable_rehash_attach(ht, tbl, new_tbl);
	if (err) {
		bucket_table_free(new_tbl);
		if (err == -EEXIST)
			err = 0;
	} else
		schedule_work(&ht->run_work);

	return err;

fail:
	/* Do not fail the insert if someone else did a rehash. */
	if (likely(rcu_access_pointer(tbl->future_tbl)))
		return 0;

	/* Schedule async rehash to retry allocation in process context. */
	if (err == -ENOMEM)
		schedule_work(&ht->run_work);

	return err;
}

static void *rhashtable_lookup_one(struct rhashtable *ht,
				   struct bucket_table *tbl, unsigned int hash,
				   const void *key, struct rhash_head *obj)
{
	struct rhashtable_compare_arg arg = {
		.ht = ht,
		.key = key,
	};
	struct rhash_head __rcu **pprev;
	struct rhash_head *head;
	int elasticity;

	elasticity = RHT_ELASTICITY;
	pprev = rht_bucket_var(tbl, hash);
	rht_for_each_from(head, *pprev, tbl, hash) {
		struct rhlist_head *list;
		struct rhlist_head *plist;

		elasticity--;
		if (!key ||
		    (ht->p.obj_cmpfn ?
		     ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
		     rhashtable_compare(&arg, rht_obj(ht, head)))) {
			pprev = &head->next;
			continue;
		}

		if (!ht->rhlist)
			return rht_obj(ht, head);

		list = container_of(obj, struct rhlist_head, rhead);
		plist = container_of(head, struct rhlist_head, rhead);

		RCU_INIT_POINTER(list->next, plist);
		head = rht_dereference_bucket(head->next, tbl, hash);
		RCU_INIT_POINTER(list->rhead.next, head);
		rcu_assign_pointer(*pprev, obj);

		return NULL;
	}

	if (elasticity <= 0)
		return ERR_PTR(-EAGAIN);

	return ERR_PTR(-ENOENT);
}

static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
						  struct bucket_table *tbl,
						  unsigned int hash,
						  struct rhash_head *obj,
						  void *data)
{
	struct rhash_head __rcu **pprev;
	struct bucket_table *new_tbl;
	struct rhash_head *head;

	if (!IS_ERR_OR_NULL(data))
		return ERR_PTR(-EEXIST);

	if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT)
		return ERR_CAST(data);

	new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
	if (new_tbl)
		return new_tbl;

	if (PTR_ERR(data) != -ENOENT)
		return ERR_CAST(data);

	if (unlikely(rht_grow_above_max(ht, tbl)))
		return ERR_PTR(-E2BIG);

	if (unlikely(rht_grow_above_100(ht, tbl)))
		return ERR_PTR(-EAGAIN);

	pprev = rht_bucket_insert(ht, tbl, hash);
	if (!pprev)
		return ERR_PTR(-ENOMEM);

	head = rht_dereference_bucket(*pprev, tbl, hash);

	RCU_INIT_POINTER(obj->next, head);
	if (ht->rhlist) {
		struct rhlist_head *list;

		list = container_of(obj, struct rhlist_head, rhead);
		RCU_INIT_POINTER(list->next, NULL);
	}

	rcu_assign_pointer(*pprev, obj);

	atomic_inc(&ht->nelems);
	if (rht_grow_above_75(ht, tbl))
		schedule_work(&ht->run_work);

	return NULL;
}

static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
				   struct rhash_head *obj)
{
	struct bucket_table *new_tbl;
	struct bucket_table *tbl;
	unsigned int hash;
	void *data;

	new_tbl = rcu_dereference(ht->tbl);

	do {
		tbl = new_tbl;
		hash = rht_head_hashfn(ht, tbl, obj, ht->p);
		spin_lock_bh(rht_bucket_lock(tbl, hash));

		data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
		new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
		if (PTR_ERR(new_tbl) != -EEXIST)
			data = ERR_CAST(new_tbl);

		spin_unlock_bh(rht_bucket_lock(tbl, hash));
	} while (!IS_ERR_OR_NULL(new_tbl));

	if (PTR_ERR(data) == -EAGAIN)
		data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
			       -EAGAIN);

	return data;
}

void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
			     struct rhash_head *obj)
{
	void *data;

	do {
		rcu_read_lock();
		data = rhashtable_try_insert(ht, key, obj);
		rcu_read_unlock();
	} while (PTR_ERR(data) == -EAGAIN);

	return data;
}
EXPORT_SYMBOL_GPL(rhashtable_insert_slow);

/**
 * rhashtable_walk_enter - Initialise an iterator
 * @ht:		Table to walk over
 * @iter:	Hash table Iterator
 *
 * This function prepares a hash table walk.
 *
 * Note that if you restart a walk after rhashtable_walk_stop you
 * may see the same object twice.  Also, you may miss objects if
 * there are removals in between rhashtable_walk_stop and the next
 * call to rhashtable_walk_start.
 *
 * For a completely stable walk you should construct your own data
 * structure outside the hash table.
 *
 * This function may be called from any process context, including
 * non-preemptable context, but cannot be called from softirq or
 * hardirq context.
 *
 * You must call rhashtable_walk_exit after this function returns.
 */
void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
{
	iter->ht = ht;
	iter->p = NULL;
	iter->slot = 0;
	iter->skip = 0;
	iter->end_of_table = 0;

	spin_lock(&ht->lock);
	iter->walker.tbl =
		rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
	list_add(&iter->walker.list, &iter->walker.tbl->walkers);
	spin_unlock(&ht->lock);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_enter);

/**
 * rhashtable_walk_exit - Free an iterator
 * @iter:	Hash table Iterator
 *
 * This function frees resources allocated by rhashtable_walk_enter.
 */
void rhashtable_walk_exit(struct rhashtable_iter *iter)
{
	spin_lock(&iter->ht->lock);
	if (iter->walker.tbl)
		list_del(&iter->walker.list);
	spin_unlock(&iter->ht->lock);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_exit);

/**
 * rhashtable_walk_start_check - Start a hash table walk
 * @iter:	Hash table iterator
 *
 * Start a hash table walk at the current iterator position.  Note that we take
 * the RCU lock in all cases including when we return an error.  So you must
 * always call rhashtable_walk_stop to clean up.
 *
 * Returns zero if successful.
 *
 * Returns -EAGAIN if resize event occurred.  Note that the iterator
 * will rewind back to the beginning and you may use it immediately
 * by calling rhashtable_walk_next.
 *
 * rhashtable_walk_start is defined as an inline variant that returns
 * void. This is preferred in cases where the caller would ignore
 * resize events and always continue.
 */
int rhashtable_walk_start_check(struct rhashtable_iter *iter)
	__acquires(RCU)
{
	struct rhashtable *ht = iter->ht;
	bool rhlist = ht->rhlist;

	rcu_read_lock();

	spin_lock(&ht->lock);
	if (iter->walker.tbl)
		list_del(&iter->walker.list);
	spin_unlock(&ht->lock);

	if (iter->end_of_table)
		return 0;
	if (!iter->walker.tbl) {
		iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
		iter->slot = 0;
		iter->skip = 0;
		return -EAGAIN;
	}

	if (iter->p && !rhlist) {
		/*
		 * We need to validate that 'p' is still in the table, and
		 * if so, update 'skip'
		 */
		struct rhash_head *p;
		int skip = 0;
		rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
			skip++;
			if (p == iter->p) {
				iter->skip = skip;
				goto found;
			}
		}
		iter->p = NULL;
	} else if (iter->p && rhlist) {
		/* Need to validate that 'list' is still in the table, and
		 * if so, update 'skip' and 'p'.
		 */
		struct rhash_head *p;
		struct rhlist_head *list;
		int skip = 0;
		rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
			for (list = container_of(p, struct rhlist_head, rhead);
			     list;
			     list = rcu_dereference(list->next)) {
				skip++;
				if (list == iter->list) {
					iter->p = p;
					iter->skip = skip;
					goto found;
				}
			}
		}
		iter->p = NULL;
	}
found:
	return 0;
}
EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);

/**
 * __rhashtable_walk_find_next - Find the next element in a table (or the first
 * one in case of a new walk).
 *
 * @iter:	Hash table iterator
 *
 * Returns the found object or NULL when the end of the table is reached.
 *
 * Returns -EAGAIN if resize event occurred.
 */
static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter)
{
	struct bucket_table *tbl = iter->walker.tbl;
	struct rhlist_head *list = iter->list;
	struct rhashtable *ht = iter->ht;
	struct rhash_head *p = iter->p;
	bool rhlist = ht->rhlist;

	if (!tbl)
		return NULL;

	for (; iter->slot < tbl->size; iter->slot++) {
		int skip = iter->skip;

		rht_for_each_rcu(p, tbl, iter->slot) {
			if (rhlist) {
				list = container_of(p, struct rhlist_head,
						    rhead);
				do {
					if (!skip)
						goto next;
					skip--;
					list = rcu_dereference(list->next);
				} while (list);

				continue;
			}
			if (!skip)
				break;
			skip--;
		}

next:
		if (!rht_is_a_nulls(p)) {
			iter->skip++;
			iter->p = p;
			iter->list = list;
			return rht_obj(ht, rhlist ? &list->rhead : p);
		}

		iter->skip = 0;
	}

	iter->p = NULL;

	/* Ensure we see any new tables. */
	smp_rmb();

	iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht);
	if (iter->walker.tbl) {
		iter->slot = 0;
		iter->skip = 0;
		return ERR_PTR(-EAGAIN);
	} else {
		iter->end_of_table = true;
	}

	return NULL;
}

/**
 * rhashtable_walk_next - Return the next object and advance the iterator
 * @iter:	Hash table iterator
 *
 * Note that you must call rhashtable_walk_stop when you are finished
 * with the walk.
 *
 * Returns the next object or NULL when the end of the table is reached.
 *
 * Returns -EAGAIN if resize event occurred.  Note that the iterator
 * will rewind back to the beginning and you may continue to use it.
 */
void *rhashtable_walk_next(struct rhashtable_iter *iter)
{
	struct rhlist_head *list = iter->list;
	struct rhashtable *ht = iter->ht;
	struct rhash_head *p = iter->p;
	bool rhlist = ht->rhlist;

	if (p) {
		if (!rhlist || !(list = rcu_dereference(list->next))) {
			p = rcu_dereference(p->next);
			list = container_of(p, struct rhlist_head, rhead);
		}
		if (!rht_is_a_nulls(p)) {
			iter->skip++;
			iter->p = p;
			iter->list = list;
			return rht_obj(ht, rhlist ? &list->rhead : p);
		}

		/* At the end of this slot, switch to next one and then find
		 * next entry from that point.
		 */
		iter->skip = 0;
		iter->slot++;
	}

	return __rhashtable_walk_find_next(iter);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_next);

/**
 * rhashtable_walk_peek - Return the next object but don't advance the iterator
 * @iter:	Hash table iterator
 *
 * Returns the next object or NULL when the end of the table is reached.
 *
 * Returns -EAGAIN if resize event occurred.  Note that the iterator
 * will rewind back to the beginning and you may continue to use it.
 */
void *rhashtable_walk_peek(struct rhashtable_iter *iter)
{
	struct rhlist_head *list = iter->list;
	struct rhashtable *ht = iter->ht;
	struct rhash_head *p = iter->p;

	if (p)
		return rht_obj(ht, ht->rhlist ? &list->rhead : p);

	/* No object found in current iter, find next one in the table. */

	if (iter->skip) {
		/* A nonzero skip value points to the next entry in the table
		 * beyond that last one that was found. Decrement skip so
		 * we find the current value. __rhashtable_walk_find_next
		 * will restore the original value of skip assuming that
		 * the table hasn't changed.
		 */
		iter->skip--;
	}

	return __rhashtable_walk_find_next(iter);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_peek);

/**
 * rhashtable_walk_stop - Finish a hash table walk
 * @iter:	Hash table iterator
 *
 * Finish a hash table walk.  Does not reset the iterator to the start of the
 * hash table.
 */
void rhashtable_walk_stop(struct rhashtable_iter *iter)
	__releases(RCU)
{
	struct rhashtable *ht;
	struct bucket_table *tbl = iter->walker.tbl;

	if (!tbl)
		goto out;

	ht = iter->ht;

	spin_lock(&ht->lock);
	if (rcu_head_after_call_rcu(&tbl->rcu, bucket_table_free_rcu))
		/* This bucket table is being freed, don't re-link it. */
		iter->walker.tbl = NULL;
	else
		list_add(&iter->walker.list, &tbl->walkers);
	spin_unlock(&ht->lock);

out:
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(rhashtable_walk_stop);
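
/* Example: a typical walker loop built from the API above (an illustrative
 * sketch, not part of the original file; "struct test_obj", its "node"
 * member and "my_ht" are hypothetical).  rhashtable_walk_next() may return
 * ERR_PTR(-EAGAIN) after a resize, in which case the iterator has been
 * rewound and the walk simply continues from the start.
 *
 *	struct rhashtable_iter iter;
 *	struct test_obj *obj;
 *
 *	rhashtable_walk_enter(&my_ht, &iter);
 *	rhashtable_walk_start(&iter);
 *	while ((obj = rhashtable_walk_next(&iter)) != NULL) {
 *		if (IS_ERR(obj)) {
 *			if (PTR_ERR(obj) == -EAGAIN)
 *				continue;	// table resized, iterator rewound
 *			break;
 *		}
 *		// use obj; duplicates/misses are possible across a resize
 *	}
 *	rhashtable_walk_stop(&iter);
 *	rhashtable_walk_exit(&iter);
 */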

static size_t rounded_hashtable_size(const struct rhashtable_params *params)
{
	size_t retsize;

	if (params->nelem_hint)
		retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
			      (unsigned long)params->min_size);
	else
		retsize = max(HASH_DEFAULT_SIZE,
			      (unsigned long)params->min_size);

	return retsize;
}

static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
{
	return jhash2(key, length, seed);
}

/**
 * rhashtable_init - initialize a new hash table
 * @ht:		hash table to be initialized
 * @params:	configuration parameters
 *
 * Initializes a new hash table based on the provided configuration
 * parameters. A table can be configured either with a variable or
 * fixed length key:
 *
 * Configuration Example 1: Fixed length keys
 * struct test_obj {
 *	int			key;
 *	void *			my_member;
 *	struct rhash_head	node;
 * };
 *
 * struct rhashtable_params params = {
 *	.head_offset = offsetof(struct test_obj, node),
 *	.key_offset = offsetof(struct test_obj, key),
 *	.key_len = sizeof(int),
 *	.hashfn = jhash,
 * };
 *
 * Configuration Example 2: Variable length keys
 * struct test_obj {
 *	[...]
 *	struct rhash_head	node;
 * };
 *
 * u32 my_hash_fn(const void *data, u32 len, u32 seed)
 * {
 *	struct test_obj *obj = data;
 *
 *	return [... hash ...];
 * }
 *
 * struct rhashtable_params params = {
 *	.head_offset = offsetof(struct test_obj, node),
 *	.hashfn = jhash,
 *	.obj_hashfn = my_hash_fn,
 * };
 */
int rhashtable_init(struct rhashtable *ht,
		    const struct rhashtable_params *params)
{
	struct bucket_table *tbl;
	size_t size;

	if ((!params->key_len && !params->obj_hashfn) ||
	    (params->obj_hashfn && !params->obj_cmpfn))
		return -EINVAL;

	memset(ht, 0, sizeof(*ht));
	mutex_init(&ht->mutex);
	spin_lock_init(&ht->lock);
	memcpy(&ht->p, params, sizeof(*params));

	if (params->min_size)
		ht->p.min_size = roundup_pow_of_two(params->min_size);

	/* Cap total entries at 2^31 to avoid nelems overflow. */
	ht->max_elems = 1u << 31;

	if (params->max_size) {
		ht->p.max_size = rounddown_pow_of_two(params->max_size);
		if (ht->p.max_size < ht->max_elems / 2)
			ht->max_elems = ht->p.max_size * 2;
	}

	ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);

	size = rounded_hashtable_size(&ht->p);

	if (params->locks_mul)
		ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
	else
		ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;

	ht->key_len = ht->p.key_len;
	if (!params->hashfn) {
		ht->p.hashfn = jhash;

		if (!(ht->key_len & (sizeof(u32) - 1))) {
			ht->key_len /= sizeof(u32);
			ht->p.hashfn = rhashtable_jhash2;
		}
	}

	/*
	 * This is api initialization and thus we need to guarantee the
	 * initial rhashtable allocation. Upon failure, retry with the
	 * smallest possible size with __GFP_NOFAIL semantics.
	 */
	tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
	if (unlikely(tbl == NULL)) {
		size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
		tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL);
	}

	atomic_set(&ht->nelems, 0);

	RCU_INIT_POINTER(ht->tbl, tbl);

	INIT_WORK(&ht->run_work, rht_deferred_worker);

	return 0;
}
EXPORT_SYMBOL_GPL(rhashtable_init);
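
/* Example: putting the pieces together for a fixed-length key (an
 * illustrative sketch; "struct test_obj", "test_params" and "my_ht" are
 * hypothetical, and the insert/lookup helpers live in <linux/rhashtable.h>,
 * not in this file).
 *
 *	static const struct rhashtable_params test_params = {
 *		.head_offset	= offsetof(struct test_obj, node),
 *		.key_offset	= offsetof(struct test_obj, key),
 *		.key_len	= sizeof(int),
 *		.automatic_shrinking = true,
 *	};
 *
 *	err = rhashtable_init(&my_ht, &test_params);
 *	...
 *	err = rhashtable_insert_fast(&my_ht, &obj->node, test_params);
 *	found = rhashtable_lookup_fast(&my_ht, &key, test_params);
 */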

/**
 * rhltable_init - initialize a new hash list table
 * @hlt:	hash list table to be initialized
 * @params:	configuration parameters
 *
 * Initializes a new hash list table.
 *
 * See documentation for rhashtable_init.
 */
int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
{
	int err;

	err = rhashtable_init(&hlt->ht, params);
	hlt->ht.rhlist = true;
	return err;
}
EXPORT_SYMBOL_GPL(rhltable_init);
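
/* Example: rhltable usage differs from rhashtable only in that duplicate
 * keys are kept on a per-key list (an illustrative sketch; "struct test_obj"
 * with a "struct rhlist_head list_node" member and "my_hlt" are hypothetical,
 * and the helpers below are assumed to come from <linux/rhashtable.h>).
 *
 *	err = rhltable_init(&my_hlt, &test_params);
 *	err = rhltable_insert(&my_hlt, &obj->list_node, test_params);
 *	...
 *	err = rhltable_remove(&my_hlt, &obj->list_node, test_params);
 */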

static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
				void (*free_fn)(void *ptr, void *arg),
				void *arg)
{
	struct rhlist_head *list;

	if (!ht->rhlist) {
		free_fn(rht_obj(ht, obj), arg);
		return;
	}

	list = container_of(obj, struct rhlist_head, rhead);
	do {
		obj = &list->rhead;
		list = rht_dereference(list->next, ht);
		free_fn(rht_obj(ht, obj), arg);
	} while (list);
}

/**
 * rhashtable_free_and_destroy - free elements and destroy hash table
 * @ht:		the hash table to destroy
 * @free_fn:	callback to release resources of element
 * @arg:	pointer passed to free_fn
 *
 * Stops an eventual async resize. If defined, invokes free_fn for each
 * element to release resources. Please note that RCU protected
 * readers may still be accessing the elements. Releasing of resources
 * must occur in a compatible manner. Then frees the bucket array.
 *
 * This function will eventually sleep to wait for an async resize
 * to complete. The caller is responsible that no further write operations
 * occur in parallel.
 */
void rhashtable_free_and_destroy(struct rhashtable *ht,
				 void (*free_fn)(void *ptr, void *arg),
				 void *arg)
{
	struct bucket_table *tbl, *next_tbl;
	unsigned int i;

	cancel_work_sync(&ht->run_work);

	mutex_lock(&ht->mutex);
	tbl = rht_dereference(ht->tbl, ht);
restart:
	if (free_fn) {
		for (i = 0; i < tbl->size; i++) {
			struct rhash_head *pos, *next;

			cond_resched();
			for (pos = rht_dereference(*rht_bucket(tbl, i), ht),
			     next = !rht_is_a_nulls(pos) ?
					rht_dereference(pos->next, ht) : NULL;
			     !rht_is_a_nulls(pos);
			     pos = next,
			     next = !rht_is_a_nulls(pos) ?
					rht_dereference(pos->next, ht) : NULL)
				rhashtable_free_one(ht, pos, free_fn, arg);
		}
	}

	next_tbl = rht_dereference(tbl->future_tbl, ht);
	bucket_table_free(tbl);
	if (next_tbl) {
		tbl = next_tbl;
		goto restart;
	}
	mutex_unlock(&ht->mutex);
}
EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);
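
/* Example: tearing down a table whose objects were allocated with kmalloc
 * (an illustrative sketch; "free_test_obj" and "my_ht" are hypothetical).
 * The callback receives each object pointer plus the opaque "arg" passed
 * here as NULL.
 *
 *	static void free_test_obj(void *ptr, void *arg)
 *	{
 *		kfree(ptr);
 *	}
 *
 *	rhashtable_free_and_destroy(&my_ht, free_test_obj, NULL);
 */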

void rhashtable_destroy(struct rhashtable *ht)
{
	return rhashtable_free_and_destroy(ht, NULL, NULL);
}
EXPORT_SYMBOL_GPL(rhashtable_destroy);

struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
					    unsigned int hash)
{
	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
	static struct rhash_head __rcu *rhnull;
	unsigned int index = hash & ((1 << tbl->nest) - 1);
	unsigned int size = tbl->size >> tbl->nest;
	unsigned int subhash = hash;
	union nested_table *ntbl;

	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
	ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash);
	subhash >>= tbl->nest;

	while (ntbl && size > (1 << shift)) {
		index = subhash & ((1 << shift) - 1);
		ntbl = rht_dereference_bucket_rcu(ntbl[index].table,
						  tbl, hash);
		size >>= shift;
		subhash >>= shift;
	}

	if (!ntbl) {
		if (!rhnull)
			INIT_RHT_NULLS_HEAD(rhnull);
		return &rhnull;
	}

	return &ntbl[subhash].bucket;

}
EXPORT_SYMBOL_GPL(rht_bucket_nested);

struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
						   struct bucket_table *tbl,
						   unsigned int hash)
{
	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
	unsigned int index = hash & ((1 << tbl->nest) - 1);
	unsigned int size = tbl->size >> tbl->nest;
	union nested_table *ntbl;

	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
	hash >>= tbl->nest;
	ntbl = nested_table_alloc(ht, &ntbl[index].table,
				  size <= (1 << shift));

	while (ntbl && size > (1 << shift)) {
		index = hash & ((1 << shift) - 1);
		size >>= shift;
		hash >>= shift;
		ntbl = nested_table_alloc(ht, &ntbl[index].table,
					  size <= (1 << shift));
	}

	if (!ntbl)
		return NULL;

	return &ntbl[hash].bucket;

}
EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);