/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 * 15 Sep 2004	Parallel async reads  --cel
 * 04 May 2005	support O_DIRECT with aio  --cel
 *
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/smp_lock.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/atomic.h>

#include "iostat.h"

#define NFSDBG_FACILITY		NFSDBG_VFS

/* slab cache for struct nfs_direct_req, created in nfs_init_directcache() */
static kmem_cache_t *nfs_direct_cachep;

/*
 * This represents a set of asynchronous requests that we're waiting on
 */
struct nfs_direct_req {
	struct kref		kref;		/* release manager */

	/* I/O parameters */
	struct list_head	list,		/* nfs_read/write_data structs */
				rewrite_list;	/* saved nfs_write_data structs */
	struct nfs_open_context	*ctx;		/* file open context info */
	struct kiocb *		iocb;		/* controlling i/o request */
	wait_queue_head_t	wait;		/* wait for i/o completion */
	struct inode *		inode;		/* target file of i/o */
	unsigned long		user_addr;	/* location of user's buffer */
	size_t			user_count;	/* total bytes to move */
	loff_t			pos;		/* starting offset in file */
	struct page **		pages;		/* pages in our buffer */
	unsigned int		npages;		/* count of pages */

	/* completion state; all fields below are guarded by ->lock */
	spinlock_t		lock;		/* protect completion state */
	int			outstanding;	/* i/os we're waiting for */
	ssize_t			count,		/* bytes actually processed */
				error;		/* any reported error */

	/* commit state (writes only) */
	struct nfs_write_data *	commit_data;	/* special write_data for commits */
	int			flags;
#define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
	struct nfs_writeverf	verf;		/* unstable write verifier */
};

/* forward declarations: the write path is defined further below */
static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync);
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);

/**
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @rw: direction (read or write)
 * @iocb: target I/O control block
 * @iov: array of vectors that define I/O buffer
 * @pos: offset in file to begin the operation
 * @nr_segs: size of iovec array
 *
 * The presence of this routine in the address space ops vector means
 * the NFS client supports direct I/O.  However, we shunt off direct
 * read and write requests before the VFS gets them, so this method
 * should never be called.
 */
ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
{
	struct dentry *dentry = iocb->ki_filp->f_dentry;

	dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
			dentry->d_name.name, (long long) pos, nr_segs);

	return -EINVAL;
}

/*
 * Drop our references on the pinned user pages and free the page
 * array itself.  When @do_dirty is set, each (non-compound) page is
 * marked dirty first so the VM knows its contents changed.
 */
static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
{
	int idx;

	for (idx = 0; idx < npages; idx++) {
		struct page *p = pages[idx];

		if (do_dirty && !PageCompound(p))
			set_page_dirty_lock(p);
		page_cache_release(p);
	}
	kfree(pages);
}

/*
 * Pin the pages backing the user buffer [user_addr, user_addr + size)
 * for direct I/O.  On success, returns the number of pages pinned and
 * stores a kmalloc'd page array in *pages; on failure returns a
 * negative errno with *pages set to NULL.
 */
static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages)
{
	int result = -ENOMEM;
	unsigned long page_count;
	size_t array_size;

	/* number of pages spanned by the (possibly unaligned) buffer */
	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	page_count -= user_addr >> PAGE_SHIFT;

	array_size = (page_count * sizeof(struct page *));
	*pages = kmalloc(array_size, GFP_KERNEL);
	if (*pages) {
		down_read(&current->mm->mmap_sem);
		result = get_user_pages(current, current->mm, user_addr,
					page_count, (rw == READ), 0,
					*pages, NULL);
		up_read(&current->mm->mmap_sem);
		if (result != page_count) {
			/*
			 * If we got fewer pages than expected from
			 * get_user_pages(), the user buffer runs off the
			 * end of a mapping; return EFAULT.
			 */
			if (result >= 0) {
				nfs_free_user_pages(*pages, result, 0);
				result = -EFAULT;
			} else
				kfree(*pages);
			*pages = NULL;
		}
	}
	return result;
}

169 170 171 172 173 174 175 176 177 178 179
static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
{
	struct nfs_direct_req *dreq;

	dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL);
	if (!dreq)
		return NULL;

	kref_init(&dreq->kref);
	init_waitqueue_head(&dreq->wait);
	INIT_LIST_HEAD(&dreq->list);
180
	INIT_LIST_HEAD(&dreq->rewrite_list);
181
	dreq->iocb = NULL;
182
	dreq->ctx = NULL;
183 184 185 186
	spin_lock_init(&dreq->lock);
	dreq->outstanding = 0;
	dreq->count = 0;
	dreq->error = 0;
187
	dreq->flags = 0;
188 189 190 191

	return dreq;
}

L
Linus Torvalds 已提交
192 193 194
static void nfs_direct_req_release(struct kref *kref)
{
	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
195 196 197

	if (dreq->ctx != NULL)
		put_nfs_open_context(dreq->ctx);
L
Linus Torvalds 已提交
198 199 200
	kmem_cache_free(nfs_direct_cachep, dreq);
}

/*
 * Collects and returns the final error value/byte-count.
 */
static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
	ssize_t result = -EIOCBQUEUED;

	/* Async requests don't wait here */
	if (dreq->iocb)
		goto out;

	/* returns nonzero (an error) if interrupted by a signal */
	result = wait_event_interruptible(dreq->wait, (dreq->outstanding == 0));

	if (!result)
		result = dreq->error;
	if (!result)
		result = dreq->count;

out:
	/* drop the waiter's reference; completion holds its own */
	kref_put(&dreq->kref, nfs_direct_req_release);
	return (ssize_t) result;
}

/*
 * We must hold a reference to all the pages in this direct read request
 * until the RPCs complete.  This could be long *after* we are woken up in
 * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
 *
 * In addition, synchronous I/O uses a stack-allocated iocb.  Thus we
 * can't trust the iocb is still valid here if this is a synchronous
 * request.  If the waiter is woken prematurely, the iocb is long gone.
 */
static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
	/* release the pinned pages, marking them dirty for reads */
	nfs_free_user_pages(dreq->pages, dreq->npages, 1);

	if (dreq->iocb) {
		long res = (long) dreq->error;
		if (!res)
			res = (long) dreq->count;
		aio_complete(dreq->iocb, res, 0);
	} else
		wake_up(&dreq->wait);

	kref_put(&dreq->kref, nfs_direct_req_release);
}

/*
 * Note we also set the number of requests we have in the dreq when we are
 * done.  This prevents races with I/O completion so we will always wait
 * until all requests have been dispatched and completed.
 */
static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize)
{
	struct list_head *list;
	struct nfs_direct_req *dreq;
	/* pages needed to hold one rsize-sized chunk */
	unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	dreq = nfs_direct_req_alloc();
	if (!dreq)
		return NULL;

	/* allocate one nfs_read_data per rsize-sized chunk of nbytes */
	list = &dreq->list;
	for(;;) {
		struct nfs_read_data *data = nfs_readdata_alloc(rpages);

		if (unlikely(!data)) {
			/* out of memory: unwind everything allocated so far */
			while (!list_empty(list)) {
				data = list_entry(list->next,
						  struct nfs_read_data, pages);
				list_del(&data->pages);
				nfs_readdata_free(data);
			}
			kref_put(&dreq->kref, nfs_direct_req_release);
			return NULL;
		}

		INIT_LIST_HEAD(&data->pages);
		list_add(&data->pages, list);

		data->req = (struct nfs_page *) dreq;
		dreq->outstanding++;
		if (nbytes <= rsize)
			break;
		nbytes -= rsize;
	}
	/* extra reference held until I/O completion (nfs_direct_complete) */
	kref_get(&dreq->kref);
	return dreq;
}

/*
 * RPC completion callback for one direct READ.  Accumulates the byte
 * count (or first error) under dreq->lock, and completes the whole
 * request when the last outstanding READ finishes.
 */
static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
{
	struct nfs_read_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

	/* nonzero means the RPC was restarted; wait for the retry */
	if (nfs_readpage_result(task, data) != 0)
		return;

	spin_lock(&dreq->lock);

	if (likely(task->tk_status >= 0))
		dreq->count += data->res.count;
	else
		dreq->error = task->tk_status;

	if (--dreq->outstanding) {
		spin_unlock(&dreq->lock);
		return;
	}

	/* last one out completes the request */
	spin_unlock(&dreq->lock);
	nfs_direct_complete(dreq);
}

/* RPC callback table for direct READ tasks */
static const struct rpc_call_ops nfs_read_direct_ops = {
	.rpc_call_done = nfs_direct_read_result,
	.rpc_release = nfs_readdata_release,
};

/*
 * For each nfs_read_data struct that was allocated on the list, dispatch
 * an NFS READ operation
 */
static void nfs_direct_read_schedule(struct nfs_direct_req *dreq)
{
	struct nfs_open_context *ctx = dreq->ctx;
	struct inode *inode = ctx->dentry->d_inode;
	struct list_head *list = &dreq->list;
	struct page **pages = dreq->pages;
	size_t count = dreq->user_count;
	loff_t pos = dreq->pos;
	size_t rsize = NFS_SERVER(inode)->rsize;
	unsigned int curpage, pgbase;

	curpage = 0;
	/* offset of the user buffer within its first page */
	pgbase = dreq->user_addr & ~PAGE_MASK;
	do {
		struct nfs_read_data *data;
		size_t bytes;

		/* each READ moves at most rsize bytes */
		bytes = rsize;
		if (count < rsize)
			bytes = count;

		/* nfs_direct_read_alloc() provided one entry per chunk */
		BUG_ON(list_empty(list));
		data = list_entry(list->next, struct nfs_read_data, pages);
		list_del_init(&data->pages);

		data->inode = inode;
		data->cred = ctx->cred;
		data->args.fh = NFS_FH(inode);
		data->args.context = ctx;
		data->args.offset = pos;
		data->args.pgbase = pgbase;
		data->args.pages = &pages[curpage];
		data->args.count = bytes;
		data->res.fattr = &data->fattr;
		data->res.eof = 0;
		data->res.count = bytes;

		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
				&nfs_read_direct_ops, data);
		NFS_PROTO(inode)->read_setup(data);

		data->task.tk_cookie = (unsigned long) inode;

		lock_kernel();
		rpc_execute(&data->task);
		unlock_kernel();

		dfprintk(VFS, "NFS: %5u initiated direct read call (req %s/%Ld, %zu bytes @ offset %Lu)\n",
				data->task.tk_pid,
				inode->i_sb->s_id,
				(long long)NFS_FILEID(inode),
				bytes,
				(unsigned long long)data->args.offset);

		/* advance file position and page-array cursor */
		pos += bytes;
		pgbase += bytes;
		curpage += pgbase >> PAGE_SHIFT;
		pgbase &= ~PAGE_MASK;

		count -= bytes;
	} while (count != 0);
	BUG_ON(!list_empty(list));
}

/*
 * Set up and dispatch an asynchronous direct read, then wait for it to
 * finish (synchronous callers only).  Signals are blocked around the
 * RPC dispatch/wait, matching the client's usual semantics.
 */
static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, unsigned int nr_pages)
{
	ssize_t result;
	sigset_t oldset;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct rpc_clnt *clnt = NFS_CLIENT(inode);
	struct nfs_direct_req *dreq;

	dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize);
	if (!dreq)
		return -ENOMEM;

	dreq->user_addr = user_addr;
	dreq->user_count = count;
	dreq->pos = pos;
	dreq->pages = pages;
	dreq->npages = nr_pages;
	dreq->inode = inode;
	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
	/* only async requests carry an iocb; see nfs_direct_complete() */
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
	rpc_clnt_sigmask(clnt, &oldset);
	nfs_direct_read_schedule(dreq);
	result = nfs_direct_wait(dreq);
	rpc_clnt_sigunmask(clnt, &oldset);

	return result;
}

419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473
static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
{
	list_splice_init(&dreq->rewrite_list, &dreq->list);
	while (!list_empty(&dreq->list)) {
		struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
		list_del(&data->pages);
		nfs_writedata_release(data);
	}
}

#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
/*
 * Resend every saved WRITE as a stable (FLUSH_STABLE) write after a
 * failed verification.  The byte count is reset because the rescheduled
 * writes will re-accumulate it.
 */
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
	struct list_head *pos;

	list_splice_init(&dreq->rewrite_list, &dreq->list);
	/* re-count the outstanding i/os we are about to dispatch */
	list_for_each(pos, &dreq->list)
		dreq->outstanding++;
	dreq->count = 0;

	nfs_direct_write_schedule(dreq, FLUSH_STABLE);
}

/*
 * RPC completion callback for the COMMIT.  If the commit failed or the
 * server's write verifier changed, schedule a stable rewrite of the data.
 */
static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
{
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

	/* Call the NFS version-specific code */
	if (NFS_PROTO(data->inode)->commit_done(task, data) != 0)
		return;
	if (unlikely(task->tk_status < 0)) {
		dreq->error = task->tk_status;
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	}
	if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
		dprintk("NFS: %5u commit verify failed\n", task->tk_pid);
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	}

	dprintk("NFS: %5u commit returned %d\n", task->tk_pid, task->tk_status);
	nfs_direct_write_complete(dreq, data->inode);
}

/* RPC callback table for direct COMMIT tasks */
static const struct rpc_call_ops nfs_commit_direct_ops = {
	.rpc_call_done = nfs_direct_commit_result,
	.rpc_release = nfs_commit_release,
};

/*
 * Dispatch an asynchronous COMMIT covering the whole direct write,
 * using the write_data pre-allocated by nfs_alloc_commit_data().
 */
static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
{
	struct nfs_write_data *data = dreq->commit_data;
	struct rpc_task *task = &data->task;

	data->inode = dreq->inode;
	data->cred = dreq->ctx->cred;

	data->args.fh = NFS_FH(data->inode);
	data->args.offset = dreq->pos;
	data->args.count = dreq->user_count;
	data->res.count = 0;
	data->res.fattr = &data->fattr;
	data->res.verf = &data->verf;

	rpc_init_task(&data->task, NFS_CLIENT(dreq->inode), RPC_TASK_ASYNC,
				&nfs_commit_direct_ops, data);
	NFS_PROTO(data->inode)->commit_setup(data, 0);

	data->task.tk_priority = RPC_PRIORITY_NORMAL;
	data->task.tk_cookie = (unsigned long)data->inode;
	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
	dreq->commit_data = NULL;

	dprintk("NFS: %5u initiated commit call\n", task->tk_pid);

	lock_kernel();
	rpc_execute(&data->task);
	unlock_kernel();
}

/*
 * Decide what happens after all WRITEs (or a COMMIT) finish: issue a
 * COMMIT for unstable data, reschedule failed-verify writes, or finish
 * the whole request.
 */
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	int flags = dreq->flags;

	dreq->flags = 0;
	switch (flags) {
		case NFS_ODIRECT_DO_COMMIT:
			nfs_direct_commit_schedule(dreq);
			break;
		case NFS_ODIRECT_RESCHED_WRITES:
			nfs_direct_write_reschedule(dreq);
			break;
		default:
			/* all data is stable on the server: clean up */
			nfs_end_data_update(inode);
			if (dreq->commit_data != NULL)
				nfs_commit_free(dreq->commit_data);
			nfs_direct_free_writedata(dreq);
			nfs_direct_complete(dreq);
	}
}

/*
 * Pre-allocate the write_data used for a later COMMIT.  If allocation
 * fails, commit_data stays NULL and the caller falls back to stable
 * (FLUSH_STABLE) writes.
 */
static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
	dreq->commit_data = nfs_commit_alloc(0);
	if (dreq->commit_data != NULL)
		dreq->commit_data->req = (struct nfs_page *) dreq;
}
#else
/* no COMMIT procedure without NFSv3/v4: always write stable */
static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
	dreq->commit_data = NULL;
}

static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	nfs_end_data_update(inode);
	nfs_direct_free_writedata(dreq);
	nfs_direct_complete(dreq);
}
#endif

/*
 * Allocate one nfs_write_data per wsize-sized chunk, mirroring
 * nfs_direct_read_alloc(), and also try to pre-allocate the special
 * write_data used for the final COMMIT.
 */
static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
{
	struct list_head *list;
	struct nfs_direct_req *dreq;
	unsigned int wpages = (wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	dreq = nfs_direct_req_alloc();
	if (!dreq)
		return NULL;

	list = &dreq->list;
	for(;;) {
		struct nfs_write_data *data = nfs_writedata_alloc(wpages);

		if (unlikely(!data)) {
			/* out of memory: unwind everything allocated so far */
			while (!list_empty(list)) {
				data = list_entry(list->next,
						  struct nfs_write_data, pages);
				list_del(&data->pages);
				nfs_writedata_free(data);
			}
			kref_put(&dreq->kref, nfs_direct_req_release);
			return NULL;
		}

		INIT_LIST_HEAD(&data->pages);
		list_add(&data->pages, list);

		data->req = (struct nfs_page *) dreq;
		dreq->outstanding++;
		if (nbytes <= wsize)
			break;
		nbytes -= wsize;
	}

	nfs_alloc_commit_data(dreq);

	/* extra reference held until I/O completion (nfs_direct_complete) */
	kref_get(&dreq->kref);
	return dreq;
}

/*
 * RPC completion callback for one direct WRITE.  Accumulates results
 * under dreq->lock and tracks the server's write verifier so unstable
 * replies can be committed (or rescheduled on verifier mismatch).
 */
static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
{
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
	int status = task->tk_status;

	/* nonzero means the RPC was restarted; wait for the retry */
	if (nfs_writeback_done(task, data) != 0)
		return;

	spin_lock(&dreq->lock);

	if (likely(status >= 0))
		dreq->count += data->res.count;
	else
		dreq->error = task->tk_status;

	if (data->res.verf->committed != NFS_FILE_SYNC) {
		switch (dreq->flags) {
			case 0:
				/* first unstable reply: remember its verifier */
				memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf));
				dreq->flags = NFS_ODIRECT_DO_COMMIT;
				break;
			case NFS_ODIRECT_DO_COMMIT:
				if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) {
					dprintk("NFS: %5u write verify failed\n", task->tk_pid);
					dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
				}
		}
	}
	/* In case we have to resend */
	data->args.stable = NFS_FILE_SYNC;

	spin_unlock(&dreq->lock);
}

/*
 * NB: Return the value of the first error return code.  Subsequent
 *     errors after the first one are ignored.
 */
static void nfs_direct_write_release(void *calldata)
{
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

	spin_lock(&dreq->lock);
	if (--dreq->outstanding) {
		spin_unlock(&dreq->lock);
		return;
	}
	spin_unlock(&dreq->lock);

	/* last WRITE out: decide whether to commit, reschedule or finish */
	nfs_direct_write_complete(dreq, data->inode);
}

/* RPC callback table for direct WRITE tasks */
static const struct rpc_call_ops nfs_write_direct_ops = {
	.rpc_call_done = nfs_direct_write_result,
	.rpc_release = nfs_direct_write_release,
};

/*
 * For each nfs_write_data struct that was allocated on the list, dispatch
 * an NFS WRITE operation
 */
static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync)
{
	struct nfs_open_context *ctx = dreq->ctx;
	struct inode *inode = ctx->dentry->d_inode;
	struct list_head *list = &dreq->list;
	struct page **pages = dreq->pages;
	size_t count = dreq->user_count;
	loff_t pos = dreq->pos;
	size_t wsize = NFS_SERVER(inode)->wsize;
	unsigned int curpage, pgbase;

	curpage = 0;
	/* offset of the user buffer within its first page */
	pgbase = dreq->user_addr & ~PAGE_MASK;
	do {
		struct nfs_write_data *data;
		size_t bytes;

		/* each WRITE moves at most wsize bytes */
		bytes = wsize;
		if (count < wsize)
			bytes = count;

		BUG_ON(list_empty(list));
		data = list_entry(list->next, struct nfs_write_data, pages);
		/* save on rewrite_list in case verification fails later */
		list_move_tail(&data->pages, &dreq->rewrite_list);

		data->inode = inode;
		data->cred = ctx->cred;
		data->args.fh = NFS_FH(inode);
		data->args.context = ctx;
		data->args.offset = pos;
		data->args.pgbase = pgbase;
		data->args.pages = &pages[curpage];
		data->args.count = bytes;
		data->res.fattr = &data->fattr;
		data->res.count = bytes;
		data->res.verf = &data->verf;

		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
				&nfs_write_direct_ops, data);
		NFS_PROTO(inode)->write_setup(data, sync);

		data->task.tk_priority = RPC_PRIORITY_NORMAL;
		data->task.tk_cookie = (unsigned long) inode;

		lock_kernel();
		rpc_execute(&data->task);
		unlock_kernel();

		dfprintk(VFS, "NFS: %5u initiated direct write call (req %s/%Ld, %zu bytes @ offset %Lu)\n",
				data->task.tk_pid,
				inode->i_sb->s_id,
				(long long)NFS_FILEID(inode),
				bytes,
				(unsigned long long)data->args.offset);

		/* advance file position and page-array cursor */
		pos += bytes;
		pgbase += bytes;
		curpage += pgbase >> PAGE_SHIFT;
		pgbase &= ~PAGE_MASK;

		count -= bytes;
	} while (count != 0);
	BUG_ON(!list_empty(list));
}
/*
 * Set up and dispatch an asynchronous direct write, then wait for it
 * to finish (synchronous callers only).  Writes go out unstable when a
 * commit_data could be allocated and the transfer spans more than one
 * wsize chunk; otherwise they are sent FLUSH_STABLE.
 */
static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, int nr_pages)
{
	ssize_t result;
	sigset_t oldset;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct rpc_clnt *clnt = NFS_CLIENT(inode);
	struct nfs_direct_req *dreq;
	size_t wsize = NFS_SERVER(inode)->wsize;
	int sync = 0;

	dreq = nfs_direct_write_alloc(count, wsize);
	if (!dreq)
		return -ENOMEM;
	/* a single-chunk write gains nothing from a separate COMMIT */
	if (dreq->commit_data == NULL || count < wsize)
		sync = FLUSH_STABLE;

	dreq->user_addr = user_addr;
	dreq->user_count = count;
	dreq->pos = pos;
	dreq->pages = pages;
	dreq->npages = nr_pages;
	dreq->inode = inode;
	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
	/* only async requests carry an iocb; see nfs_direct_complete() */
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count);

	nfs_begin_data_update(inode);

	rpc_clnt_sigmask(clnt, &oldset);
	nfs_direct_write_schedule(dreq, sync);
	result = nfs_direct_wait(dreq);
	rpc_clnt_sigunmask(clnt, &oldset);

	return result;
}

/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
 * @buf: user's buffer into which to read data
751 752
 * @count: number of bytes to read
 * @pos: byte offset in file where reading starts
L
Linus Torvalds 已提交
753 754 755 756 757 758
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid gfar's check to see if
 * the request starts before the end of the file.  For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
759
 * READ where the file size could change.  Our preference is simply
L
Linus Torvalds 已提交
760 761
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
762
 *
L
Linus Torvalds 已提交
763 764 765 766 767
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
768
ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
L
Linus Torvalds 已提交
769 770
{
	ssize_t retval = -EINVAL;
771 772
	int page_count;
	struct page **pages;
L
Linus Torvalds 已提交
773 774 775
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;

776
	dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n",
777 778
		file->f_dentry->d_parent->d_name.name,
		file->f_dentry->d_name.name,
779
		(unsigned long) count, (long long) pos);
L
Linus Torvalds 已提交
780 781 782 783

	if (count < 0)
		goto out;
	retval = -EFAULT;
784
	if (!access_ok(VERIFY_WRITE, buf, count))
L
Linus Torvalds 已提交
785 786 787 788 789
		goto out;
	retval = 0;
	if (!count)
		goto out;

T
Trond Myklebust 已提交
790 791 792
	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;
L
Linus Torvalds 已提交
793

794
	retval = nfs_get_user_pages(READ, (unsigned long) buf,
795
						count, &pages);
796
	if (retval < 0)
797
		goto out;
798
	page_count = retval;
799

800
	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos,
801
						pages, page_count);
L
Linus Torvalds 已提交
802
	if (retval > 0)
803
		iocb->ki_pos = pos + retval;
L
Linus Torvalds 已提交
804 805 806 807 808 809 810 811 812

out:
	return retval;
}

/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
 * @buf: user's buffer from which to write data
 * @count: number of bytes to write
 * @pos: byte offset in file where writing starts
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size.  The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache.  We let the server do generic write
 * parameter checking and report problems.
 *
 * We also avoid an unnecessary invocation of generic_osync_inode(),
 * as it is fairly meaningless to sync the metadata of an NFS file.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
{
	ssize_t retval;
	int page_count;
	struct page **pages;
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;

	dfprintk(VFS, "nfs: direct write(%s/%s, %lu@%Ld)\n",
		file->f_dentry->d_parent->d_name.name,
		file->f_dentry->d_name.name,
		(unsigned long) count, (long long) pos);

	/* may clamp pos/count (e.g. for rlimits) or reject the write */
	retval = generic_write_checks(file, &pos, &count, 0);
	if (retval)
		goto out;

	retval = -EINVAL;
	if ((ssize_t) count < 0)
		goto out;
	retval = 0;
	if (!count)
		goto out;

	retval = -EFAULT;
	if (!access_ok(VERIFY_READ, buf, count))
		goto out;

	/* flush any pending cached writes before we overwrite the range */
	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;

	retval = nfs_get_user_pages(WRITE, (unsigned long) buf,
						count, &pages);
	if (retval < 0)
		goto out;
	page_count = retval;

	retval = nfs_direct_write(iocb, (unsigned long) buf, count,
					pos, pages, page_count);

	/*
	 * XXX: nfs_end_data_update() already ensures this file's
	 *      cached data is subsequently invalidated.  Do we really
	 *      need to call invalidate_inode_pages2() again here?
	 *
	 *      For aio writes, this invalidation will almost certainly
	 *      occur before the writes complete.  Kind of racey.
	 */
	if (mapping->nrpages)
		invalidate_inode_pages2(mapping);

	if (retval > 0)
		iocb->ki_pos = pos + retval;

out:
	return retval;
}

893 894 895 896
/**
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
L
Linus Torvalds 已提交
897 898 899 900 901 902 903 904 905 906 907 908
int nfs_init_directcache(void)
{
	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
						sizeof(struct nfs_direct_req),
						0, SLAB_RECLAIM_ACCOUNT,
						NULL, NULL);
	if (nfs_direct_cachep == NULL)
		return -ENOMEM;

	return 0;
}

909 910 911 912
/**
 * nfs_init_directcache - destroy the slab cache for nfs_direct_req structures
 *
 */
L
Linus Torvalds 已提交
913 914 915 916 917
void nfs_destroy_directcache(void)
{
	if (kmem_cache_destroy(nfs_direct_cachep))
		printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n");
}