/*
 * Request reply cache. This is currently a global cache, but this may
 * change in the future and be a per-client cache.
 *
 * This code is heavily inspired by the 44BSD implementation, although
 * it does things a bit differently.
 *
 * Copyright (C) 1995, 1996 Olaf Kirch <okir@monad.swb.de>
 */

#include <linux/slab.h>
#include <linux/sunrpc/addr.h>
#include <linux/highmem.h>
#include <linux/log2.h>
#include <linux/hash.h>
#include <net/checksum.h>

#include "nfsd.h"
#include "cache.h"

#define NFSDDBG_FACILITY	NFSDDBG_REPCACHE

/*
 * We use this value to determine the number of hash buckets from the max
 * cache size, the idea being that when the cache is at its maximum number
 * of entries, then this should be the average number of entries per bucket.
 */
#define TARGET_BUCKET_SIZE	64

static struct hlist_head *	cache_hash;
static struct list_head 	lru_head;
static struct kmem_cache	*drc_slab;

/* max number of entries allowed in the cache */
static unsigned int		max_drc_entries;

/* number of significant bits in the hash value */
static unsigned int		maskbits;

/*
 * Stats and other tracking of the duplicate reply cache. All of these and
 * the "rc" fields in nfsdstats are protected by the cache_lock
 */

/* total number of entries */
static unsigned int		num_drc_entries;

/* cache misses due only to checksum comparison failures */
static unsigned int		payload_misses;

/* amount of memory (in bytes) currently consumed by the DRC */
static unsigned int		drc_mem_usage;

/* longest hash chain seen */
static unsigned int		longest_chain;

/* size of cache when we saw the longest hash chain */
static unsigned int		longest_chain_cachesize;

static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
static void	cache_cleaner_func(struct work_struct *unused);
static unsigned long nfsd_reply_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc);
static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc);

static struct shrinker nfsd_reply_cache_shrinker = {
	.scan_objects = nfsd_reply_cache_scan,
	.count_objects = nfsd_reply_cache_count,
	.seeks	= 1,
};

/*
 * locking for the reply cache:
 * A cache entry is "single use" if c_state == RC_INPROG
 * Otherwise, when accessing _prev or _next, the lock must be held.
 */
static DEFINE_SPINLOCK(cache_lock);
static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);

/*
 * Put a cap on the size of the DRC based on the amount of available
 * low memory in the machine.
 *
 *  64MB:    8192
 * 128MB:   11585
 * 256MB:   16384
 * 512MB:   23170
 *   1GB:   32768
 *   2GB:   46340
 *   4GB:   65536
 *   8GB:   92681
 *  16GB:  131072
 *
 * ...with a hard cap of 256k entries. In the worst case, each entry will be
 * ~1k, so the above numbers should give a rough max of the amount of memory
 * used in k.
 */
static unsigned int
nfsd_cache_size_limit(void)
{
	unsigned int limit;
	unsigned long low_pages = totalram_pages - totalhigh_pages;

	limit = (16 * int_sqrt(low_pages)) << (PAGE_SHIFT-10);
	return min_t(unsigned int, limit, 256*1024);
}

/*
 * Compute the number of hash buckets we need. Divide the max cachesize by
 * the "target" max bucket size, and round up to next power of two.
 */
static unsigned int
nfsd_hashsize(unsigned int limit)
{
	return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE);
}

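/*
 * Allocate a new entry from the svc_cacherep slab. The entry comes back
 * in the RC_UNUSED/RC_NOCACHE state with its LRU and hash nodes
 * initialized; the caller is responsible for hashing it and putting it
 * on the LRU.
 */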
static struct svc_cacherep *
nfsd_reply_cache_alloc(void)
{
	struct svc_cacherep	*rp;

	rp = kmem_cache_alloc(drc_slab, GFP_KERNEL);
	if (rp) {
		rp->c_state = RC_UNUSED;
		rp->c_type = RC_NOCACHE;
		INIT_LIST_HEAD(&rp->c_lru);
		INIT_HLIST_NODE(&rp->c_hash);
	}
	return rp;
}

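/*
 * Free a cache entry. Must be called with the cache_lock held. Releases
 * any cached reply buffer, unhashes the entry, and updates the
 * num_drc_entries and drc_mem_usage accounting before returning the
 * entry to the slab.
 */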
static void
nfsd_reply_cache_free_locked(struct svc_cacherep *rp)
{
	if (rp->c_type == RC_REPLBUFF && rp->c_replvec.iov_base) {
		drc_mem_usage -= rp->c_replvec.iov_len;
		kfree(rp->c_replvec.iov_base);
	}
	if (!hlist_unhashed(&rp->c_hash))
		hlist_del(&rp->c_hash);
	list_del(&rp->c_lru);
	--num_drc_entries;
	drc_mem_usage -= sizeof(*rp);
	kmem_cache_free(drc_slab, rp);
}

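/*
 * Unlocked wrapper around nfsd_reply_cache_free_locked(): takes the
 * cache_lock, frees the entry, and drops the lock.
 */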
static void
nfsd_reply_cache_free(struct svc_cacherep *rp)
{
	spin_lock(&cache_lock);
	nfsd_reply_cache_free_locked(rp);
	spin_unlock(&cache_lock);
}

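/*
 * Set up the reply cache at nfsd startup: size the cache and hash table
 * from available low memory, register the shrinker, and allocate the
 * svc_cacherep slab and hash bucket array. Returns -ENOMEM on failure.
 */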
int nfsd_reply_cache_init(void)
{
	unsigned int hashsize;

	INIT_LIST_HEAD(&lru_head);
	max_drc_entries = nfsd_cache_size_limit();
	num_drc_entries = 0;
	hashsize = nfsd_hashsize(max_drc_entries);
	maskbits = ilog2(hashsize);

	register_shrinker(&nfsd_reply_cache_shrinker);
	drc_slab = kmem_cache_create("nfsd_drc", sizeof(struct svc_cacherep),
					0, 0, NULL);
	if (!drc_slab)
		goto out_nomem;

	cache_hash = kcalloc(hashsize, sizeof(struct hlist_head), GFP_KERNEL);
	if (!cache_hash)
		goto out_nomem;

	return 0;
out_nomem:
	printk(KERN_ERR "nfsd: failed to allocate reply cache\n");
	nfsd_reply_cache_shutdown();
	return -ENOMEM;
}

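/*
 * Tear down the reply cache: unregister the shrinker, cancel the cleaner
 * job, free any entries still on the LRU, and release the hash table and
 * slab cache.
 */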
void nfsd_reply_cache_shutdown(void)
{
	struct svc_cacherep	*rp;

	unregister_shrinker(&nfsd_reply_cache_shrinker);
	cancel_delayed_work_sync(&cache_cleaner);

	while (!list_empty(&lru_head)) {
		rp = list_entry(lru_head.next, struct svc_cacherep, c_lru);
		nfsd_reply_cache_free_locked(rp);
	}

	kfree(cache_hash);
	cache_hash = NULL;

	if (drc_slab) {
		kmem_cache_destroy(drc_slab);
		drc_slab = NULL;
	}
}

/*
 * Move cache entry to end of LRU list, and queue the cleaner to run if it's
 * not already scheduled.
 */
static void
lru_put_end(struct svc_cacherep *rp)
{
	rp->c_timestamp = jiffies;
	list_move_tail(&rp->c_lru, &lru_head);
	schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
}

/*
 * Move a cache entry from one hash list to another
 */
static void
hash_refile(struct svc_cacherep *rp)
{
	hlist_del_init(&rp->c_hash);
	/*
	 * No point in byte swapping c_xid since we're just using it to pick
	 * a hash bucket.
	 */
	hlist_add_head(&rp->c_hash, cache_hash +
			hash_32((__force u32)rp->c_xid, maskbits));
}

/*
 * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
 * Also prune the oldest ones when the total exceeds the max number of entries.
 */
static long
prune_cache_entries(void)
{
	struct svc_cacherep *rp, *tmp;
	long freed = 0;

	list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) {
		/*
		 * Don't free entries attached to calls that are still
		 * in-progress, but do keep scanning the list.
		 */
		if (rp->c_state == RC_INPROG)
			continue;
		if (num_drc_entries <= max_drc_entries &&
		    time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
			break;
		nfsd_reply_cache_free_locked(rp);
		freed++;
	}

	/*
	 * Conditionally rearm the job. If we cleaned out the list, then
	 * cancel any pending run (since there won't be any work to do).
	 * Otherwise, we rearm the job or modify the existing one to run in
	 * RC_EXPIRE since we just ran the pruner.
	 */
	if (list_empty(&lru_head))
		cancel_delayed_work(&cache_cleaner);
	else
		mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
	return freed;
}

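/*
 * Workqueue job that periodically prunes expired entries; just wraps
 * prune_cache_entries() in the cache_lock.
 */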
static void
cache_cleaner_func(struct work_struct *unused)
{
	spin_lock(&cache_lock);
	prune_cache_entries();
	spin_unlock(&cache_lock);
}

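/*
 * Shrinker "count" callback: report the current number of cache entries
 * so the VM can decide how much to ask us to scan.
 */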
static unsigned long
nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
	unsigned long num;

	spin_lock(&cache_lock);
	num = num_drc_entries;
	spin_unlock(&cache_lock);

	return num;
}

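/*
 * Shrinker "scan" callback: prune entries under the cache_lock and
 * return the number freed.
 */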
static unsigned long
nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
{
	unsigned long freed;

	spin_lock(&cache_lock);
	freed = prune_cache_entries();
	spin_unlock(&cache_lock);
	return freed;
}
/*
 * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
 */
static __wsum
nfsd_cache_csum(struct svc_rqst *rqstp)
{
	int idx;
	unsigned int base;
	__wsum csum;
	struct xdr_buf *buf = &rqstp->rq_arg;
	const unsigned char *p = buf->head[0].iov_base;
	size_t csum_len = min_t(size_t, buf->head[0].iov_len + buf->page_len,
				RC_CSUMLEN);
	size_t len = min(buf->head[0].iov_len, csum_len);

	/* rq_arg.head first */
	csum = csum_partial(p, len, 0);
	csum_len -= len;

	/* Continue into page array */
	idx = buf->page_base / PAGE_SIZE;
	base = buf->page_base & ~PAGE_MASK;
	while (csum_len) {
		p = page_address(buf->pages[idx]) + base;
		len = min_t(size_t, PAGE_SIZE - base, csum_len);
		csum = csum_partial(p, len, csum);
		csum_len -= len;
		base = 0;
		++idx;
	}
	return csum;
}

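/*
 * Compare an incoming request against a cache entry: first the RPC
 * header fields (xid, proc, prot, vers, length, source address and
 * port), then the checksum of the NFS payload. A mismatch on the
 * checksum alone is counted in payload_misses.
 */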
static bool
nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp)
{
	/* Check RPC header info first */
	if (rqstp->rq_xid != rp->c_xid || rqstp->rq_proc != rp->c_proc ||
	    rqstp->rq_prot != rp->c_prot || rqstp->rq_vers != rp->c_vers ||
	    rqstp->rq_arg.len != rp->c_len ||
	    !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) ||
	    rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr))
		return false;

	/* compare checksum of NFS data */
	if (csum != rp->c_csum) {
		++payload_misses;
		return false;
	}

	return true;
}

/*
 * Search the request hash for an entry that matches the given rqstp.
 * Must be called with cache_lock held. Returns the found entry or
 * NULL on failure.
 */
static struct svc_cacherep *
nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum)
{
	struct svc_cacherep	*rp, *ret = NULL;
	struct hlist_head	*rh;
	unsigned int		entries = 0;

	/*
	 * No point in byte swapping rq_xid since we're just using it to pick
	 * a hash bucket.
	 */
	rh = &cache_hash[hash_32((__force u32)rqstp->rq_xid, maskbits)];
	hlist_for_each_entry(rp, rh, c_hash) {
		++entries;
		if (nfsd_cache_match(rqstp, csum, rp)) {
			ret = rp;
			break;
		}
	}

	/* tally hash chain length stats */
	if (entries > longest_chain) {
		longest_chain = entries;
		longest_chain_cachesize = num_drc_entries;
	} else if (entries == longest_chain) {
		/* prefer to keep the smallest cachesize possible here */
		longest_chain_cachesize = min(longest_chain_cachesize,
						num_drc_entries);
	}

	return ret;
}

/*
 * Try to find an entry matching the current call in the cache. When none
 * is found, we try to grab the oldest expired entry off the LRU list. If
 * a suitable one isn't there, then drop the cache_lock and allocate a
 * new one, then search again in case one got inserted while this thread
 * didn't hold the lock.
 */
int
nfsd_cache_lookup(struct svc_rqst *rqstp)
{
	struct svc_cacherep	*rp, *found;
	__be32			xid = rqstp->rq_xid;
	u32			proto = rqstp->rq_prot,
				vers = rqstp->rq_vers,
				proc = rqstp->rq_proc;
	__wsum			csum;
	unsigned long		age;
	int type = rqstp->rq_cachetype;
	int rtn = RC_DOIT;

	rqstp->rq_cacherep = NULL;
	if (type == RC_NOCACHE) {
		nfsdstats.rcnocache++;
		return rtn;
	}

	csum = nfsd_cache_csum(rqstp);

	/*
	 * Since the common case is a cache miss followed by an insert,
	 * preallocate an entry.
	 */
	rp = nfsd_reply_cache_alloc();
	spin_lock(&cache_lock);
	if (likely(rp)) {
		++num_drc_entries;
		drc_mem_usage += sizeof(*rp);
	}

	/* go ahead and prune the cache */
	prune_cache_entries();

	found = nfsd_cache_search(rqstp, csum);
	if (found) {
		if (likely(rp))
			nfsd_reply_cache_free_locked(rp);
		rp = found;
		goto found_entry;
	}

	if (!rp) {
		dprintk("nfsd: unable to allocate DRC entry!\n");
		goto out;
	}

	nfsdstats.rcmisses++;
	rqstp->rq_cacherep = rp;
	rp->c_state = RC_INPROG;
	rp->c_xid = xid;
	rp->c_proc = proc;
	rpc_copy_addr((struct sockaddr *)&rp->c_addr, svc_addr(rqstp));
	rpc_set_port((struct sockaddr *)&rp->c_addr, rpc_get_port(svc_addr(rqstp)));
	rp->c_prot = proto;
	rp->c_vers = vers;
	rp->c_len = rqstp->rq_arg.len;
	rp->c_csum = csum;

	hash_refile(rp);
	lru_put_end(rp);

	/* release any buffer */
	if (rp->c_type == RC_REPLBUFF) {
		drc_mem_usage -= rp->c_replvec.iov_len;
		kfree(rp->c_replvec.iov_base);
		rp->c_replvec.iov_base = NULL;
	}
	rp->c_type = RC_NOCACHE;
 out:
	spin_unlock(&cache_lock);
	return rtn;

found_entry:
	nfsdstats.rchits++;
	/* We found a matching entry which is either in progress or done. */
	age = jiffies - rp->c_timestamp;
	lru_put_end(rp);

	rtn = RC_DROPIT;
	/* Request being processed or excessive rexmits */
	if (rp->c_state == RC_INPROG || age < RC_DELAY)
		goto out;

	/* From the hall of fame of impractical attacks:
	 * Is this a user who tries to snoop on the cache? */
	rtn = RC_DOIT;
	if (!rqstp->rq_secure && rp->c_secure)
		goto out;

	/* Compose RPC reply header */
	switch (rp->c_type) {
	case RC_NOCACHE:
		break;
	case RC_REPLSTAT:
		svc_putu32(&rqstp->rq_res.head[0], rp->c_replstat);
		rtn = RC_REPLY;
		break;
	case RC_REPLBUFF:
		if (!nfsd_cache_append(rqstp, &rp->c_replvec))
			goto out;	/* should not happen */
		rtn = RC_REPLY;
		break;
	default:
		printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type);
		nfsd_reply_cache_free_locked(rp);
	}

	goto out;
}

/*
 * Update a cache entry. This is called from nfsd_dispatch when
 * the procedure has been executed and the complete reply is in
 * rqstp->rq_res.
 *
 * We're copying around data here rather than swapping buffers because
 * the toplevel loop requires max-sized buffers, which would be a waste
 * of memory for a cache with a max reply size of 100 bytes (diropokres).
 *
 * If we should start to use different types of cache entries tailored
 * specifically for attrstat and fh's, we may save even more space.
 *
 * Also note that a cachetype of RC_NOCACHE can legally be passed when
 * nfsd failed to encode a reply that otherwise would have been cached.
 * In this case, nfsd_cache_update is called with statp == NULL.
 */
void
nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
{
	struct svc_cacherep *rp = rqstp->rq_cacherep;
	struct kvec	*resv = &rqstp->rq_res.head[0], *cachv;
	int		len;
	size_t		bufsize = 0;

	if (!rp)
		return;

	len = resv->iov_len - ((char*)statp - (char*)resv->iov_base);
	len >>= 2;

	/* Don't cache excessive amounts of data and XDR failures */
	if (!statp || len > (256 >> 2)) {
		nfsd_reply_cache_free(rp);
		return;
	}

	switch (cachetype) {
	case RC_REPLSTAT:
		if (len != 1)
			printk("nfsd: RC_REPLSTAT/reply len %d!\n",len);
		rp->c_replstat = *statp;
		break;
	case RC_REPLBUFF:
		cachv = &rp->c_replvec;
		bufsize = len << 2;
		cachv->iov_base = kmalloc(bufsize, GFP_KERNEL);
		if (!cachv->iov_base) {
			nfsd_reply_cache_free(rp);
			return;
		}
		cachv->iov_len = bufsize;
		memcpy(cachv->iov_base, statp, bufsize);
		break;
	case RC_NOCACHE:
		nfsd_reply_cache_free(rp);
		return;
	}
	spin_lock(&cache_lock);
	drc_mem_usage += bufsize;
	lru_put_end(rp);
	rp->c_secure = rqstp->rq_secure;
	rp->c_type = cachetype;
	rp->c_state = RC_DONE;
	spin_unlock(&cache_lock);
	return;
}

/*
 * Copy cached reply to current reply buffer. Should always fit.
 * FIXME as reply is in a page, we should just attach the page, and
 * keep a refcount....
 */
static int
nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data)
{
	struct kvec	*vec = &rqstp->rq_res.head[0];

	if (vec->iov_len + data->iov_len > PAGE_SIZE) {
		printk(KERN_WARNING "nfsd: cached reply too large (%Zd).\n",
				data->iov_len);
		return 0;
	}
	memcpy((char*)vec->iov_base + vec->iov_len, data->iov_base, data->iov_len);
	vec->iov_len += data->iov_len;
	return 1;
}

/*
 * Note that fields may be added, removed or reordered in the future. Programs
 * scraping this file for info should test the labels to ensure they're
 * getting the correct field.
 */
static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
{
	spin_lock(&cache_lock);
	seq_printf(m, "max entries:           %u\n", max_drc_entries);
	seq_printf(m, "num entries:           %u\n", num_drc_entries);
	seq_printf(m, "hash buckets:          %u\n", 1 << maskbits);
	seq_printf(m, "mem usage:             %u\n", drc_mem_usage);
	seq_printf(m, "cache hits:            %u\n", nfsdstats.rchits);
	seq_printf(m, "cache misses:          %u\n", nfsdstats.rcmisses);
	seq_printf(m, "not cached:            %u\n", nfsdstats.rcnocache);
	seq_printf(m, "payload misses:        %u\n", payload_misses);
	seq_printf(m, "longest chain len:     %u\n", longest_chain);
	seq_printf(m, "cachesize at longest:  %u\n", longest_chain_cachesize);
	spin_unlock(&cache_lock);
	return 0;
}

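/* seq_file open routine for the reply cache stats file */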
int nfsd_reply_cache_stats_open(struct inode *inode, struct file *file)
{
	return single_open(file, nfsd_reply_cache_stats_show, NULL);
}