/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 * 15 Sep 2004	Parallel async reads  --cel
 * 04 May 2005	support O_DIRECT with aio  --cel
 *
 */
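
/*
 * For illustration, a minimal user-space sketch of the access pattern
 * described above: the application opens with O_DIRECT and supplies a
 * suitably aligned buffer, since the client does not correct unaligned
 * requests.  The path and the 4096-byte size are only examples.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdlib.h>
 *	#include <unistd.h>
 *
 *	int direct_io_example(const char *path)
 *	{
 *		void *buf;
 *		int fd = open(path, O_RDWR | O_DIRECT);
 *		if (fd < 0)
 *			return -1;
 *		if (posix_memalign(&buf, 4096, 4096)) {
 *			close(fd);
 *			return -1;
 *		}
 *		if (pread(fd, buf, 4096, 0) == 4096)	// read goes straight to the server
 *			pwrite(fd, buf, 4096, 4096);	// durable before pwrite() returns
 *		free(buf);
 *		return close(fd);
 *	}
 */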

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/smp_lock.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/atomic.h>

#include "iostat.h"

#define NFSDBG_FACILITY		NFSDBG_VFS

static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty);
static kmem_cache_t *nfs_direct_cachep;

/*
 * This represents a set of asynchronous requests that we're waiting on
 */
struct nfs_direct_req {
	struct kref		kref;		/* release manager */

	/* I/O parameters */
	struct list_head	list,		/* nfs_read/write_data structs */
				rewrite_list;	/* saved nfs_write_data structs */
	struct nfs_open_context	*ctx;		/* file open context info */
	struct kiocb *		iocb;		/* controlling i/o request */
	wait_queue_head_t	wait;		/* wait for i/o completion */
	struct inode *		inode;		/* target file of i/o */
	unsigned long		user_addr;	/* location of user's buffer */
	size_t			user_count;	/* total bytes to move */
	loff_t			pos;		/* starting offset in file */
	struct page **		pages;		/* pages in our buffer */
	unsigned int		npages;		/* count of pages */

	/* completion state */
	spinlock_t		lock;		/* protect completion state */
	int			outstanding;	/* i/os we're waiting for */
	ssize_t			count,		/* bytes actually processed */
				error;		/* any reported error */

	/* commit state */
	struct nfs_write_data *	commit_data;	/* special write_data for commits */
	int			flags;
#define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
	struct nfs_writeverf	verf;		/* unstable write verifier */
};

static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync);
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);

/**
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @rw: direction (read or write)
 * @iocb: target I/O control block
 * @iov: array of vectors that define I/O buffer
 * @pos: offset in file to begin the operation
 * @nr_segs: size of iovec array
 *
 * The presence of this routine in the address space ops vector means
 * the NFS client supports direct I/O.  However, we shunt off direct
 * read and write requests before the VFS gets them, so this method
 * should never be called.
 */
ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
{
	struct dentry *dentry = iocb->ki_filp->f_dentry;

	dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
			dentry->d_name.name, (long long) pos, nr_segs);

	return -EINVAL;
}

static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages)
{
	int result = -ENOMEM;
	unsigned long page_count;
	size_t array_size;

	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	page_count -= user_addr >> PAGE_SHIFT;
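	/*
	 * page_count now holds the number of pages spanned by the byte
	 * range [user_addr, user_addr + size): the page index of the
	 * rounded-up end minus the page index of the start.  With 4KB
	 * pages, for example, a page-sized buffer that begins one byte
	 * into a page spans two pages.
	 */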

	array_size = (page_count * sizeof(struct page *));
	*pages = kmalloc(array_size, GFP_KERNEL);
	if (*pages) {
		down_read(&current->mm->mmap_sem);
		result = get_user_pages(current, current->mm, user_addr,
					page_count, (rw == READ), 0,
					*pages, NULL);
		up_read(&current->mm->mmap_sem);
		/*
		 * If we got fewer pages than expected from get_user_pages(),
		 * the user buffer runs off the end of a mapping; return EFAULT.
		 */
		if (result >= 0 && result < page_count) {
			nfs_free_user_pages(*pages, result, 0);
			*pages = NULL;
			result = -EFAULT;
		}
	}
	return result;
}

static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
{
	int i;
	for (i = 0; i < npages; i++) {
		struct page *page = pages[i];
		if (do_dirty && !PageCompound(page))
			set_page_dirty_lock(page);
		page_cache_release(page);
	}
	kfree(pages);
}

static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
{
	struct nfs_direct_req *dreq;

	dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL);
	if (!dreq)
		return NULL;

	kref_init(&dreq->kref);
	init_waitqueue_head(&dreq->wait);
	INIT_LIST_HEAD(&dreq->list);
	INIT_LIST_HEAD(&dreq->rewrite_list);
	dreq->iocb = NULL;
	dreq->ctx = NULL;
	spin_lock_init(&dreq->lock);
	dreq->outstanding = 0;
	dreq->count = 0;
	dreq->error = 0;
	dreq->flags = 0;

	return dreq;
}

static void nfs_direct_req_release(struct kref *kref)
{
	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);

	if (dreq->ctx != NULL)
		put_nfs_open_context(dreq->ctx);
	kmem_cache_free(nfs_direct_cachep, dreq);
}

/*
 * Collects and returns the final error value/byte-count.
 */
static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
	ssize_t result = -EIOCBQUEUED;

	/* Async requests don't wait here */
	if (dreq->iocb)
		goto out;

	result = wait_event_interruptible(dreq->wait, (dreq->outstanding == 0));

	if (!result)
		result = dreq->error;
	if (!result)
		result = dreq->count;

out:
	kref_put(&dreq->kref, nfs_direct_req_release);
	return (ssize_t) result;
}

/*
 * We must hold a reference to all the pages in this direct read request
 * until the RPCs complete.  This could be long *after* we are woken up in
 * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
 *
 * In addition, synchronous I/O uses a stack-allocated iocb.  Thus we
 * can't trust the iocb is still valid here if this is a synchronous
 * request.  If the waiter is woken prematurely, the iocb is long gone.
 */
static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
	nfs_free_user_pages(dreq->pages, dreq->npages, 1);

	if (dreq->iocb) {
		long res = (long) dreq->error;
		if (!res)
			res = (long) dreq->count;
		aio_complete(dreq->iocb, res, 0);
	} else
		wake_up(&dreq->wait);

	kref_put(&dreq->kref, nfs_direct_req_release);
}

/*
 * Note we also set the number of requests we have in the dreq when we are
 * done.  This prevents races with I/O completion so we will always wait
 * until all requests have been dispatched and completed.
 */
static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize)
{
	struct list_head *list;
	struct nfs_direct_req *dreq;
	unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	dreq = nfs_direct_req_alloc();
	if (!dreq)
		return NULL;

	list = &dreq->list;
	for(;;) {
		struct nfs_read_data *data = nfs_readdata_alloc(rpages);

		if (unlikely(!data)) {
			while (!list_empty(list)) {
				data = list_entry(list->next,
						  struct nfs_read_data, pages);
				list_del(&data->pages);
				nfs_readdata_free(data);
			}
			kref_put(&dreq->kref, nfs_direct_req_release);
			return NULL;
		}

		INIT_LIST_HEAD(&data->pages);
		list_add(&data->pages, list);

		data->req = (struct nfs_page *) dreq;
		dreq->outstanding++;
		if (nbytes <= rsize)
			break;
		nbytes -= rsize;
	}
	kref_get(&dreq->kref);
	return dreq;
}

static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
{
	struct nfs_read_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

	if (nfs_readpage_result(task, data) != 0)
		return;

	spin_lock(&dreq->lock);

	if (likely(task->tk_status >= 0))
		dreq->count += data->res.count;
	else
		dreq->error = task->tk_status;

	if (--dreq->outstanding) {
		spin_unlock(&dreq->lock);
		return;
	}

	spin_unlock(&dreq->lock);
	nfs_direct_complete(dreq);
}

static const struct rpc_call_ops nfs_read_direct_ops = {
	.rpc_call_done = nfs_direct_read_result,
	.rpc_release = nfs_readdata_release,
};

/*
 * For each nfs_read_data struct that was allocated on the list, dispatch
 * an NFS READ operation
 */
static void nfs_direct_read_schedule(struct nfs_direct_req *dreq)
{
	struct nfs_open_context *ctx = dreq->ctx;
	struct inode *inode = ctx->dentry->d_inode;
	struct list_head *list = &dreq->list;
	struct page **pages = dreq->pages;
	size_t count = dreq->user_count;
	loff_t pos = dreq->pos;
	size_t rsize = NFS_SERVER(inode)->rsize;
	unsigned int curpage, pgbase;

	curpage = 0;
	pgbase = dreq->user_addr & ~PAGE_MASK;
	do {
		struct nfs_read_data *data;
		size_t bytes;

		bytes = rsize;
		if (count < rsize)
			bytes = count;

		data = list_entry(list->next, struct nfs_read_data, pages);
		list_del_init(&data->pages);

		data->inode = inode;
		data->cred = ctx->cred;
		data->args.fh = NFS_FH(inode);
		data->args.context = ctx;
		data->args.offset = pos;
		data->args.pgbase = pgbase;
		data->args.pages = &pages[curpage];
		data->args.count = bytes;
		data->res.fattr = &data->fattr;
		data->res.eof = 0;
		data->res.count = bytes;

		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
				&nfs_read_direct_ops, data);
		NFS_PROTO(inode)->read_setup(data);

		data->task.tk_cookie = (unsigned long) inode;

		lock_kernel();
		rpc_execute(&data->task);
		unlock_kernel();

		dfprintk(VFS, "NFS: %4d initiated direct read call (req %s/%Ld, %u bytes @ offset %Lu)\n",
				data->task.tk_pid,
				inode->i_sb->s_id,
				(long long)NFS_FILEID(inode),
				bytes,
				(unsigned long long)data->args.offset);

		pos += bytes;
		pgbase += bytes;
		curpage += pgbase >> PAGE_SHIFT;
		pgbase &= ~PAGE_MASK;

		count -= bytes;
	} while (count != 0);
}

static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, unsigned int nr_pages)
{
	ssize_t result;
	sigset_t oldset;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct rpc_clnt *clnt = NFS_CLIENT(inode);
	struct nfs_direct_req *dreq;

	dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize);
	if (!dreq)
		return -ENOMEM;

	dreq->user_addr = user_addr;
	dreq->user_count = count;
	dreq->pos = pos;
	dreq->pages = pages;
	dreq->npages = nr_pages;
	dreq->inode = inode;
	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
	rpc_clnt_sigmask(clnt, &oldset);
	nfs_direct_read_schedule(dreq);
	result = nfs_direct_wait(dreq);
	rpc_clnt_sigunmask(clnt, &oldset);

	return result;
}

static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
{
	list_splice_init(&dreq->rewrite_list, &dreq->list);
	while (!list_empty(&dreq->list)) {
		struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
		list_del(&data->pages);
		nfs_writedata_release(data);
	}
}

#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
	struct list_head *pos;

	list_splice_init(&dreq->rewrite_list, &dreq->list);
	list_for_each(pos, &dreq->list)
		dreq->outstanding++;
	dreq->count = 0;

	nfs_direct_write_schedule(dreq, FLUSH_STABLE);
}

static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
{
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

	/* Call the NFS version-specific code */
	if (NFS_PROTO(data->inode)->commit_done(task, data) != 0)
		return;
	if (unlikely(task->tk_status < 0)) {
		dreq->error = task->tk_status;
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	}
	if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
		dprintk("NFS: %5u commit verify failed\n", task->tk_pid);
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	}

	dprintk("NFS: %5u commit returned %d\n", task->tk_pid, task->tk_status);
	nfs_direct_write_complete(dreq, data->inode);
}

static const struct rpc_call_ops nfs_commit_direct_ops = {
	.rpc_call_done = nfs_direct_commit_result,
	.rpc_release = nfs_commit_release,
};

static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
{
	struct nfs_write_data *data = dreq->commit_data;
	struct rpc_task *task = &data->task;

	data->inode = dreq->inode;
	data->cred = dreq->ctx->cred;

	data->args.fh = NFS_FH(data->inode);
	data->args.offset = dreq->pos;
	data->args.count = dreq->user_count;
	data->res.count = 0;
	data->res.fattr = &data->fattr;
	data->res.verf = &data->verf;

	rpc_init_task(&data->task, NFS_CLIENT(dreq->inode), RPC_TASK_ASYNC,
				&nfs_commit_direct_ops, data);
	NFS_PROTO(data->inode)->commit_setup(data, 0);

	data->task.tk_priority = RPC_PRIORITY_NORMAL;
	data->task.tk_cookie = (unsigned long)data->inode;
	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
	dreq->commit_data = NULL;

	dprintk("NFS: %5u initiated commit call\n", task->tk_pid);

	lock_kernel();
	rpc_execute(&data->task);
	unlock_kernel();
}

static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	int flags = dreq->flags;

	dreq->flags = 0;
	switch (flags) {
		case NFS_ODIRECT_DO_COMMIT:
			nfs_direct_commit_schedule(dreq);
			break;
		case NFS_ODIRECT_RESCHED_WRITES:
			nfs_direct_write_reschedule(dreq);
			break;
		default:
			nfs_end_data_update(inode);
			if (dreq->commit_data != NULL)
				nfs_commit_free(dreq->commit_data);
			nfs_direct_free_writedata(dreq);
			nfs_direct_complete(dreq);
	}
}

static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
	dreq->commit_data = nfs_commit_alloc(0);
	if (dreq->commit_data != NULL)
		dreq->commit_data->req = (struct nfs_page *) dreq;
}
#else
static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
	dreq->commit_data = NULL;
}

static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	nfs_end_data_update(inode);
	nfs_direct_free_writedata(dreq);
	nfs_direct_complete(dreq);
}
#endif

static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
{
	struct list_head *list;
	struct nfs_direct_req *dreq;
	unsigned int wpages = (wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	dreq = nfs_direct_req_alloc();
	if (!dreq)
		return NULL;

	list = &dreq->list;
	for(;;) {
		struct nfs_write_data *data = nfs_writedata_alloc(wpages);

		if (unlikely(!data)) {
			while (!list_empty(list)) {
				data = list_entry(list->next,
						  struct nfs_write_data, pages);
				list_del(&data->pages);
				nfs_writedata_free(data);
			}
			kref_put(&dreq->kref, nfs_direct_req_release);
			return NULL;
		}

		INIT_LIST_HEAD(&data->pages);
		list_add(&data->pages, list);

		data->req = (struct nfs_page *) dreq;
		dreq->outstanding++;
		if (nbytes <= wsize)
			break;
		nbytes -= wsize;
	}

	nfs_alloc_commit_data(dreq);

	kref_get(&dreq->kref);
	return dreq;
}

static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
{
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
	int status = task->tk_status;

	if (nfs_writeback_done(task, data) != 0)
		return;

	spin_lock(&dreq->lock);

	if (likely(status >= 0))
		dreq->count += data->res.count;
	else
		dreq->error = task->tk_status;

	if (data->res.verf->committed != NFS_FILE_SYNC) {
		switch (dreq->flags) {
			case 0:
				memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf));
				dreq->flags = NFS_ODIRECT_DO_COMMIT;
				break;
			case NFS_ODIRECT_DO_COMMIT:
				if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) {
					dprintk("NFS: %5u write verify failed\n", task->tk_pid);
					dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
				}
		}
	}
	/* In case we have to resend */
	data->args.stable = NFS_FILE_SYNC;

	spin_unlock(&dreq->lock);
}

/*
 * NB: Return the value of the first error return code.  Subsequent
 *     errors after the first one are ignored.
 */
static void nfs_direct_write_release(void *calldata)
{
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

	spin_lock(&dreq->lock);
	if (--dreq->outstanding) {
		spin_unlock(&dreq->lock);
		return;
	}
	spin_unlock(&dreq->lock);

	nfs_direct_write_complete(dreq, data->inode);
}

static const struct rpc_call_ops nfs_write_direct_ops = {
	.rpc_call_done = nfs_direct_write_result,
	.rpc_release = nfs_direct_write_release,
};

/*
 * For each nfs_write_data struct that was allocated on the list, dispatch
 * an NFS WRITE operation
 */
static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync)
{
	struct nfs_open_context *ctx = dreq->ctx;
	struct inode *inode = ctx->dentry->d_inode;
	struct list_head *list = &dreq->list;
	struct page **pages = dreq->pages;
	size_t count = dreq->user_count;
	loff_t pos = dreq->pos;
	size_t wsize = NFS_SERVER(inode)->wsize;
	unsigned int curpage, pgbase;

	curpage = 0;
	pgbase = dreq->user_addr & ~PAGE_MASK;
	do {
		struct nfs_write_data *data;
		size_t bytes;

		bytes = wsize;
		if (count < wsize)
			bytes = count;

		data = list_entry(list->next, struct nfs_write_data, pages);
		list_move_tail(&data->pages, &dreq->rewrite_list);

		data->inode = inode;
		data->cred = ctx->cred;
		data->args.fh = NFS_FH(inode);
		data->args.context = ctx;
		data->args.offset = pos;
		data->args.pgbase = pgbase;
		data->args.pages = &pages[curpage];
		data->args.count = bytes;
		data->res.fattr = &data->fattr;
		data->res.count = bytes;
		data->res.verf = &data->verf;

		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
				&nfs_write_direct_ops, data);
		NFS_PROTO(inode)->write_setup(data, sync);

		data->task.tk_priority = RPC_PRIORITY_NORMAL;
		data->task.tk_cookie = (unsigned long) inode;

		lock_kernel();
		rpc_execute(&data->task);
		unlock_kernel();

		dfprintk(VFS, "NFS: %4d initiated direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
				data->task.tk_pid,
				inode->i_sb->s_id,
				(long long)NFS_FILEID(inode),
				bytes,
				(unsigned long long)data->args.offset);

		pos += bytes;
		pgbase += bytes;
		curpage += pgbase >> PAGE_SHIFT;
		pgbase &= ~PAGE_MASK;

		count -= bytes;
	} while (count != 0);
}

static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, int nr_pages)
{
	ssize_t result;
	sigset_t oldset;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct rpc_clnt *clnt = NFS_CLIENT(inode);
	struct nfs_direct_req *dreq;
	size_t wsize = NFS_SERVER(inode)->wsize;
	int sync = 0;

	dreq = nfs_direct_write_alloc(count, wsize);
	if (!dreq)
		return -ENOMEM;
	if (dreq->commit_data == NULL || count < wsize)
		sync = FLUSH_STABLE;

	dreq->user_addr = user_addr;
	dreq->user_count = count;
	dreq->pos = pos;
	dreq->pages = pages;
	dreq->npages = nr_pages;
	dreq->inode = inode;
	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count);

	nfs_begin_data_update(inode);

	rpc_clnt_sigmask(clnt, &oldset);
	nfs_direct_write_schedule(dreq, sync);
	result = nfs_direct_wait(dreq);
	rpc_clnt_sigunmask(clnt, &oldset);

	return result;
}

/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
 * @buf: user's buffer into which to read data
 * @count: number of bytes to read
 * @pos: byte offset in file where reading starts
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid gfar's check to see if
 * the request starts before the end of the file.  For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
 * READ where the file size could change.  Our preference is simply
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
 *
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
{
	ssize_t retval = -EINVAL;
	int page_count;
	struct page **pages;
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;

	dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n",
		file->f_dentry->d_parent->d_name.name,
		file->f_dentry->d_name.name,
		(unsigned long) count, (long long) pos);

	if (count < 0)
		goto out;
	retval = -EFAULT;
	if (!access_ok(VERIFY_WRITE, buf, count))
		goto out;
	retval = 0;
	if (!count)
		goto out;

	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;

	page_count = nfs_get_user_pages(READ, (unsigned long) buf,
						count, &pages);
	if (page_count < 0) {
		nfs_free_user_pages(pages, 0, 0);
		retval = page_count;
		goto out;
	}

	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos,
						pages, page_count);
	if (retval > 0)
		iocb->ki_pos = pos + retval;

out:
	return retval;
}

/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
 * @buf: user's buffer from which to write data
 * @count: number of bytes to write
 * @pos: byte offset in file where writing starts
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size.  The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache.  We let the server do generic write
 * parameter checking and report problems.
 *
 * We also avoid an unnecessary invocation of generic_osync_inode(),
 * as it is fairly meaningless to sync the metadata of an NFS file.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
{
	ssize_t retval;
	int page_count;
	struct page **pages;
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;

	dfprintk(VFS, "nfs: direct write(%s/%s, %lu@%Ld)\n",
		file->f_dentry->d_parent->d_name.name,
		file->f_dentry->d_name.name,
		(unsigned long) count, (long long) pos);

	retval = generic_write_checks(file, &pos, &count, 0);
	if (retval)
		goto out;

	retval = -EINVAL;
	if ((ssize_t) count < 0)
		goto out;
	retval = 0;
	if (!count)
		goto out;

	retval = -EFAULT;
	if (!access_ok(VERIFY_READ, buf, count))
		goto out;

	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;

	page_count = nfs_get_user_pages(WRITE, (unsigned long) buf,
						count, &pages);
	if (page_count < 0) {
		nfs_free_user_pages(pages, 0, 0);
		retval = page_count;
		goto out;
	}

	retval = nfs_direct_write(iocb, (unsigned long) buf, count,
					pos, pages, page_count);

	/*
	 * XXX: nfs_end_data_update() already ensures this file's
	 *      cached data is subsequently invalidated.  Do we really
	 *      need to call invalidate_inode_pages2() again here?
	 *
	 *      For aio writes, this invalidation will almost certainly
	 *      occur before the writes complete.  Kind of racey.
	 */
	if (mapping->nrpages)
		invalidate_inode_pages2(mapping);

	if (retval > 0)
		iocb->ki_pos = pos + retval;

out:
	return retval;
}

/**
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
int nfs_init_directcache(void)
{
	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
						sizeof(struct nfs_direct_req),
						0, SLAB_RECLAIM_ACCOUNT,
						NULL, NULL);
	if (nfs_direct_cachep == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
 *
 */
void nfs_destroy_directcache(void)
{
	if (kmem_cache_destroy(nfs_direct_cachep))
		printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n");
}