/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 * 15 Sep 2004	Parallel async reads  --cel
 * 04 May 2005	Support O_DIRECT with aio  --cel
 *
 */
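
/*
 * Illustrative sketch (not part of this kernel code): an application
 * typically requests uncached I/O by opening a file with O_DIRECT,
 * for example
 *
 *	fd = open("/mnt/nfs/data", O_RDWR | O_DIRECT);   (hypothetical path)
 *	nread = read(fd, buf, len);
 *
 * Reads and writes on such a descriptor are handled by
 * nfs_file_direct_read() and nfs_file_direct_write() below instead of
 * going through the page cache.
 */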

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/smp_lock.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/atomic.h>

#include "iostat.h"

#define NFSDBG_FACILITY		NFSDBG_VFS

static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty);
static kmem_cache_t *nfs_direct_cachep;

/*
 * This represents a set of asynchronous requests that we're waiting on
 */
struct nfs_direct_req {
	struct kref		kref;		/* release manager */

	/* I/O parameters */
	struct list_head	list,		/* nfs_read/write_data structs */
				rewrite_list;	/* saved nfs_write_data structs */
	struct nfs_open_context	*ctx;		/* file open context info */
	struct kiocb *		iocb;		/* controlling i/o request */
	wait_queue_head_t	wait;		/* wait for i/o completion */
	struct inode *		inode;		/* target file of i/o */
	unsigned long		user_addr;	/* location of user's buffer */
	size_t			user_count;	/* total bytes to move */
	loff_t			pos;		/* starting offset in file */
	struct page **		pages;		/* pages in our buffer */
	unsigned int		npages;		/* count of pages */

	/* completion state */
	spinlock_t		lock;		/* protect completion state */
	int			outstanding;	/* i/os we're waiting for */
	ssize_t			count,		/* bytes actually processed */
				error;		/* any reported error */

	/* commit state */
	struct nfs_write_data *	commit_data;	/* special write_data for commits */
	int			flags;
#define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
	struct nfs_writeverf	verf;		/* unstable write verifier */
};

static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync);
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);

/**
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @rw: direction (read or write)
 * @iocb: target I/O control block
 * @iov: array of vectors that define I/O buffer
 * @pos: offset in file to begin the operation
 * @nr_segs: size of iovec array
 *
 * The presence of this routine in the address space ops vector means
 * the NFS client supports direct I/O.  However, we shunt off direct
 * read and write requests before the VFS gets them, so this method
 * should never be called.
 */
ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
{
	struct dentry *dentry = iocb->ki_filp->f_dentry;

	dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
			dentry->d_name.name, (long long) pos, nr_segs);

	return -EINVAL;
}

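/*
 * Pin the pages that back the user's buffer so READ and WRITE requests
 * can target them directly while the RPCs are in flight.  Returns the
 * number of pages pinned, or a negative errno.
 */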
static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages)
{
	int result = -ENOMEM;
	unsigned long page_count;
	size_t array_size;

	page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	page_count -= user_addr >> PAGE_SHIFT;

	array_size = (page_count * sizeof(struct page *));
	*pages = kmalloc(array_size, GFP_KERNEL);
	if (*pages) {
		down_read(&current->mm->mmap_sem);
		result = get_user_pages(current, current->mm, user_addr,
					page_count, (rw == READ), 0,
					*pages, NULL);
		up_read(&current->mm->mmap_sem);
		/*
		 * If we got fewer pages than expected from get_user_pages(),
		 * the user buffer runs off the end of a mapping; return EFAULT.
		 */
		if (result >= 0 && result < page_count) {
			nfs_free_user_pages(*pages, result, 0);
			*pages = NULL;
			result = -EFAULT;
		}
	}
	return result;
}

static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
{
	int i;
	for (i = 0; i < npages; i++) {
		struct page *page = pages[i];
		if (do_dirty && !PageCompound(page))
			set_page_dirty_lock(page);
		page_cache_release(page);
	}
	kfree(pages);
}

static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
{
	struct nfs_direct_req *dreq;

	dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL);
	if (!dreq)
		return NULL;

	kref_init(&dreq->kref);
	init_waitqueue_head(&dreq->wait);
	INIT_LIST_HEAD(&dreq->list);
	INIT_LIST_HEAD(&dreq->rewrite_list);
	dreq->iocb = NULL;
	dreq->ctx = NULL;
	spin_lock_init(&dreq->lock);
	dreq->outstanding = 0;
	dreq->count = 0;
	dreq->error = 0;
	dreq->flags = 0;

	return dreq;
}

static void nfs_direct_req_release(struct kref *kref)
{
	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);

	if (dreq->ctx != NULL)
		put_nfs_open_context(dreq->ctx);
	kmem_cache_free(nfs_direct_cachep, dreq);
}

/*
 * Collects and returns the final error value/byte-count.
 */
static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
	ssize_t result = -EIOCBQUEUED;

	/* Async requests don't wait here */
	if (dreq->iocb)
		goto out;

	result = wait_event_interruptible(dreq->wait, (dreq->outstanding == 0));

	if (!result)
		result = dreq->error;
	if (!result)
		result = dreq->count;

out:
	kref_put(&dreq->kref, nfs_direct_req_release);
	return (ssize_t) result;
}

/*
 * We must hold a reference to all the pages in this direct read request
 * until the RPCs complete.  This could be long *after* we are woken up in
 * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
 *
 * In addition, synchronous I/O uses a stack-allocated iocb.  Thus we
 * can't trust the iocb is still valid here if this is a synchronous
 * request.  If the waiter is woken prematurely, the iocb is long gone.
 */
static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
	nfs_free_user_pages(dreq->pages, dreq->npages, 1);

	if (dreq->iocb) {
		long res = (long) dreq->error;
		if (!res)
			res = (long) dreq->count;
		aio_complete(dreq->iocb, res, 0);
	} else
		wake_up(&dreq->wait);

	kref_put(&dreq->kref, nfs_direct_req_release);
}

/*
 * Note we also set the number of requests we have in the dreq when we are
 * done.  This prevents races with I/O completion so we will always wait
 * until all requests have been dispatched and completed.
 */
static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize)
{
	struct list_head *list;
	struct nfs_direct_req *dreq;
	unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	dreq = nfs_direct_req_alloc();
	if (!dreq)
		return NULL;

	list = &dreq->list;
	for(;;) {
		struct nfs_read_data *data = nfs_readdata_alloc(rpages);

		if (unlikely(!data)) {
			while (!list_empty(list)) {
				data = list_entry(list->next,
						  struct nfs_read_data, pages);
				list_del(&data->pages);
				nfs_readdata_free(data);
			}
			kref_put(&dreq->kref, nfs_direct_req_release);
			return NULL;
		}

		INIT_LIST_HEAD(&data->pages);
		list_add(&data->pages, list);

		data->req = (struct nfs_page *) dreq;
		dreq->outstanding++;
		if (nbytes <= rsize)
			break;
		nbytes -= rsize;
	}
	kref_get(&dreq->kref);
	return dreq;
}

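/*
 * Each READ reply updates the running byte count or error under
 * dreq->lock; the reply that drops "outstanding" to zero completes the
 * whole direct read request.
 */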
static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
{
	struct nfs_read_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

	if (nfs_readpage_result(task, data) != 0)
		return;

	spin_lock(&dreq->lock);

	if (likely(task->tk_status >= 0))
		dreq->count += data->res.count;
	else
		dreq->error = task->tk_status;

	if (--dreq->outstanding) {
		spin_unlock(&dreq->lock);
		return;
	}

	spin_unlock(&dreq->lock);
	nfs_direct_complete(dreq);
}

static const struct rpc_call_ops nfs_read_direct_ops = {
	.rpc_call_done = nfs_direct_read_result,
	.rpc_release = nfs_readdata_release,
};

/*
 * For each nfs_read_data struct that was allocated on the list, dispatch
 * an NFS READ operation
 */
static void nfs_direct_read_schedule(struct nfs_direct_req *dreq)
{
	struct nfs_open_context *ctx = dreq->ctx;
	struct inode *inode = ctx->dentry->d_inode;
	struct list_head *list = &dreq->list;
	struct page **pages = dreq->pages;
	size_t count = dreq->user_count;
	loff_t pos = dreq->pos;
	size_t rsize = NFS_SERVER(inode)->rsize;
	unsigned int curpage, pgbase;

	curpage = 0;
	pgbase = dreq->user_addr & ~PAGE_MASK;
	do {
		struct nfs_read_data *data;
		size_t bytes;

		bytes = rsize;
		if (count < rsize)
			bytes = count;

		BUG_ON(list_empty(list));
		data = list_entry(list->next, struct nfs_read_data, pages);
		list_del_init(&data->pages);

		data->inode = inode;
		data->cred = ctx->cred;
		data->args.fh = NFS_FH(inode);
		data->args.context = ctx;
		data->args.offset = pos;
		data->args.pgbase = pgbase;
		data->args.pages = &pages[curpage];
		data->args.count = bytes;
		data->res.fattr = &data->fattr;
		data->res.eof = 0;
		data->res.count = bytes;

		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
				&nfs_read_direct_ops, data);
		NFS_PROTO(inode)->read_setup(data);

		data->task.tk_cookie = (unsigned long) inode;

		lock_kernel();
		rpc_execute(&data->task);
		unlock_kernel();

		dfprintk(VFS, "NFS: %5u initiated direct read call (req %s/%Ld, %zu bytes @ offset %Lu)\n",
				data->task.tk_pid,
				inode->i_sb->s_id,
				(long long)NFS_FILEID(inode),
				bytes,
				(unsigned long long)data->args.offset);

		pos += bytes;
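		/* advance to the start of the next chunk of the user buffer */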
		pgbase += bytes;
		curpage += pgbase >> PAGE_SHIFT;
		pgbase &= ~PAGE_MASK;

		count -= bytes;
	} while (count != 0);
	BUG_ON(!list_empty(list));
}

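/*
 * Allocate the request, dispatch the READ calls, and collect the result.
 * Synchronous callers block in nfs_direct_wait(); asynchronous iocbs get
 * -EIOCBQUEUED back and are completed later through aio_complete().
 */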
static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, unsigned int nr_pages)
{
	ssize_t result;
	sigset_t oldset;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct rpc_clnt *clnt = NFS_CLIENT(inode);
	struct nfs_direct_req *dreq;

	dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize);
	if (!dreq)
		return -ENOMEM;

	dreq->user_addr = user_addr;
	dreq->user_count = count;
	dreq->pos = pos;
	dreq->pages = pages;
	dreq->npages = nr_pages;
	dreq->inode = inode;
	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
	rpc_clnt_sigmask(clnt, &oldset);
	nfs_direct_read_schedule(dreq);
	result = nfs_direct_wait(dreq);
	rpc_clnt_sigunmask(clnt, &oldset);

	return result;
}

static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
{
	list_splice_init(&dreq->rewrite_list, &dreq->list);
	while (!list_empty(&dreq->list)) {
		struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
		list_del(&data->pages);
		nfs_writedata_release(data);
	}
}

#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
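/*
 * The server could not guarantee our unstable data (for instance it
 * rebooted and its write verifier changed), so resend every WRITE, this
 * time asking for stable storage.
 */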
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
	struct list_head *pos;

	list_splice_init(&dreq->rewrite_list, &dreq->list);
	list_for_each(pos, &dreq->list)
		dreq->outstanding++;
	dreq->count = 0;

	nfs_direct_write_schedule(dreq, FLUSH_STABLE);
}

static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
{
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

	/* Call the NFS version-specific code */
	if (NFS_PROTO(data->inode)->commit_done(task, data) != 0)
		return;
	if (unlikely(task->tk_status < 0)) {
		dreq->error = task->tk_status;
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	}
	if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
		dprintk("NFS: %5u commit verify failed\n", task->tk_pid);
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	}

	dprintk("NFS: %5u commit returned %d\n", task->tk_pid, task->tk_status);
	nfs_direct_write_complete(dreq, data->inode);
}

static const struct rpc_call_ops nfs_commit_direct_ops = {
	.rpc_call_done = nfs_direct_commit_result,
	.rpc_release = nfs_commit_release,
};

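/*
 * Send a single COMMIT covering the entire byte range of the original
 * direct write.
 */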
static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
{
	struct nfs_write_data *data = dreq->commit_data;
	struct rpc_task *task = &data->task;

	data->inode = dreq->inode;
	data->cred = dreq->ctx->cred;

	data->args.fh = NFS_FH(data->inode);
	data->args.offset = dreq->pos;
	data->args.count = dreq->user_count;
	data->res.count = 0;
	data->res.fattr = &data->fattr;
	data->res.verf = &data->verf;

	rpc_init_task(&data->task, NFS_CLIENT(dreq->inode), RPC_TASK_ASYNC,
				&nfs_commit_direct_ops, data);
	NFS_PROTO(data->inode)->commit_setup(data, 0);

	data->task.tk_priority = RPC_PRIORITY_NORMAL;
	data->task.tk_cookie = (unsigned long)data->inode;
	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
	dreq->commit_data = NULL;

	dprintk("NFS: %5u initiated commit call\n", task->tk_pid);

	lock_kernel();
	rpc_execute(&data->task);
	unlock_kernel();
}

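/*
 * Decide what a completed round of WRITEs needs next: a COMMIT for data
 * the server has only cached, a full resend if verification failed, or
 * final completion of the direct write request.
 */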
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	int flags = dreq->flags;

	dreq->flags = 0;
	switch (flags) {
		case NFS_ODIRECT_DO_COMMIT:
			nfs_direct_commit_schedule(dreq);
			break;
		case NFS_ODIRECT_RESCHED_WRITES:
			nfs_direct_write_reschedule(dreq);
			break;
		default:
			nfs_end_data_update(inode);
			if (dreq->commit_data != NULL)
				nfs_commit_free(dreq->commit_data);
			nfs_direct_free_writedata(dreq);
			nfs_direct_complete(dreq);
	}
}

static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
	dreq->commit_data = nfs_commit_alloc(0);
	if (dreq->commit_data != NULL)
		dreq->commit_data->req = (struct nfs_page *) dreq;
}
#else
static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
	dreq->commit_data = NULL;
}

static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	nfs_end_data_update(inode);
	nfs_direct_free_writedata(dreq);
	nfs_direct_complete(dreq);
}
#endif

static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
{
	struct list_head *list;
	struct nfs_direct_req *dreq;
	unsigned int wpages = (wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	dreq = nfs_direct_req_alloc();
	if (!dreq)
		return NULL;

	list = &dreq->list;
	for(;;) {
		struct nfs_write_data *data = nfs_writedata_alloc(wpages);

		if (unlikely(!data)) {
			while (!list_empty(list)) {
				data = list_entry(list->next,
						  struct nfs_write_data, pages);
				list_del(&data->pages);
				nfs_writedata_free(data);
			}
			kref_put(&dreq->kref, nfs_direct_req_release);
			return NULL;
		}

		INIT_LIST_HEAD(&data->pages);
		list_add(&data->pages, list);

		data->req = (struct nfs_page *) dreq;
		dreq->outstanding++;
		if (nbytes <= wsize)
			break;
		nbytes -= wsize;
	}

	nfs_alloc_commit_data(dreq);

	kref_get(&dreq->kref);
	return dreq;
}

static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
{
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
	int status = task->tk_status;

	if (nfs_writeback_done(task, data) != 0)
		return;

	spin_lock(&dreq->lock);

	if (likely(status >= 0))
		dreq->count += data->res.count;
	else
		dreq->error = task->tk_status;

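	/*
	 * An unstable reply means the data may not yet be on stable
	 * storage.  Remember the server's verifier so a later COMMIT can
	 * confirm it; if replies carry different verifiers, the server
	 * has likely rebooted and the writes must be resent.
	 */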
	if (data->res.verf->committed != NFS_FILE_SYNC) {
		switch (dreq->flags) {
			case 0:
				memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf));
				dreq->flags = NFS_ODIRECT_DO_COMMIT;
				break;
			case NFS_ODIRECT_DO_COMMIT:
				if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) {
					dprintk("NFS: %5u write verify failed\n", task->tk_pid);
					dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
				}
		}
	}
	/* In case we have to resend */
	data->args.stable = NFS_FILE_SYNC;

	spin_unlock(&dreq->lock);
}

/*
 * NB: Return the value of the first error return code.  Subsequent
 *     errors after the first one are ignored.
 */
static void nfs_direct_write_release(void *calldata)
{
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

	spin_lock(&dreq->lock);
	if (--dreq->outstanding) {
		spin_unlock(&dreq->lock);
		return;
	}
	spin_unlock(&dreq->lock);

	nfs_direct_write_complete(dreq, data->inode);
}

static const struct rpc_call_ops nfs_write_direct_ops = {
	.rpc_call_done = nfs_direct_write_result,
	.rpc_release = nfs_direct_write_release,
};

/*
 * For each nfs_write_data struct that was allocated on the list, dispatch
 * an NFS WRITE operation
 */
static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync)
{
	struct nfs_open_context *ctx = dreq->ctx;
	struct inode *inode = ctx->dentry->d_inode;
	struct list_head *list = &dreq->list;
	struct page **pages = dreq->pages;
	size_t count = dreq->user_count;
	loff_t pos = dreq->pos;
	size_t wsize = NFS_SERVER(inode)->wsize;
	unsigned int curpage, pgbase;

	curpage = 0;
	pgbase = dreq->user_addr & ~PAGE_MASK;
	do {
		struct nfs_write_data *data;
		size_t bytes;

		bytes = wsize;
		if (count < wsize)
			bytes = count;

		BUG_ON(list_empty(list));
		data = list_entry(list->next, struct nfs_write_data, pages);
		list_move_tail(&data->pages, &dreq->rewrite_list);

		data->inode = inode;
		data->cred = ctx->cred;
		data->args.fh = NFS_FH(inode);
		data->args.context = ctx;
		data->args.offset = pos;
		data->args.pgbase = pgbase;
		data->args.pages = &pages[curpage];
		data->args.count = bytes;
		data->res.fattr = &data->fattr;
		data->res.count = bytes;
		data->res.verf = &data->verf;

		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
				&nfs_write_direct_ops, data);
		NFS_PROTO(inode)->write_setup(data, sync);

		data->task.tk_priority = RPC_PRIORITY_NORMAL;
		data->task.tk_cookie = (unsigned long) inode;

		lock_kernel();
		rpc_execute(&data->task);
		unlock_kernel();

		dfprintk(VFS, "NFS: %5u initiated direct write call (req %s/%Ld, %zu bytes @ offset %Lu)\n",
				data->task.tk_pid,
				inode->i_sb->s_id,
				(long long)NFS_FILEID(inode),
				bytes,
				(unsigned long long)data->args.offset);

		pos += bytes;
		pgbase += bytes;
		curpage += pgbase >> PAGE_SHIFT;
		pgbase &= ~PAGE_MASK;

		count -= bytes;
	} while (count != 0);
	BUG_ON(!list_empty(list));
}

static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, int nr_pages)
{
	ssize_t result;
	sigset_t oldset;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct rpc_clnt *clnt = NFS_CLIENT(inode);
	struct nfs_direct_req *dreq;
	size_t wsize = NFS_SERVER(inode)->wsize;
	int sync = 0;

	dreq = nfs_direct_write_alloc(count, wsize);
	if (!dreq)
		return -ENOMEM;
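	/*
	 * Ask for a stable (FILE_SYNC) write when the request is smaller
	 * than wsize or when no commit_data could be allocated, so that
	 * no separate COMMIT is needed.
	 */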
	if (dreq->commit_data == NULL || count < wsize)
		sync = FLUSH_STABLE;

	dreq->user_addr = user_addr;
	dreq->user_count = count;
	dreq->pos = pos;
	dreq->pages = pages;
	dreq->npages = nr_pages;
	dreq->inode = inode;
	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count);

	nfs_begin_data_update(inode);

	rpc_clnt_sigmask(clnt, &oldset);
	nfs_direct_write_schedule(dreq, sync);
	result = nfs_direct_wait(dreq);
	rpc_clnt_sigunmask(clnt, &oldset);

	return result;
}

/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
 * @buf: user's buffer into which to read data
 * @count: number of bytes to read
 * @pos: byte offset in file where reading starts
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid gfar's check to see if
 * the request starts before the end of the file.  For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
 * READ where the file size could change.  Our preference is simply
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
 *
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
{
	ssize_t retval = -EINVAL;
	int page_count;
	struct page **pages;
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;

	dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n",
		file->f_dentry->d_parent->d_name.name,
		file->f_dentry->d_name.name,
		(unsigned long) count, (long long) pos);

	if (count < 0)
		goto out;
	retval = -EFAULT;
	if (!access_ok(VERIFY_WRITE, buf, count))
		goto out;
	retval = 0;
	if (!count)
		goto out;

	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;

	page_count = nfs_get_user_pages(READ, (unsigned long) buf,
						count, &pages);
	if (page_count < 0) {
		nfs_free_user_pages(pages, 0, 0);
		retval = page_count;
		goto out;
	}

	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos,
						pages, page_count);
	if (retval > 0)
		iocb->ki_pos = pos + retval;

out:
	return retval;
}

/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
 * @buf: user's buffer from which to write data
 * @count: number of bytes to write
 * @pos: byte offset in file where writing starts
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size.  The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache.  We let the server do generic write
 * parameter checking and report problems.
 *
 * We also avoid an unnecessary invocation of generic_osync_inode(),
 * as it is fairly meaningless to sync the metadata of an NFS file.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
{
	ssize_t retval;
	int page_count;
	struct page **pages;
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;

	dfprintk(VFS, "nfs: direct write(%s/%s, %lu@%Ld)\n",
		file->f_dentry->d_parent->d_name.name,
		file->f_dentry->d_name.name,
		(unsigned long) count, (long long) pos);

	retval = generic_write_checks(file, &pos, &count, 0);
	if (retval)
		goto out;

	retval = -EINVAL;
	if ((ssize_t) count < 0)
		goto out;
	retval = 0;
	if (!count)
		goto out;

	retval = -EFAULT;
	if (!access_ok(VERIFY_READ, buf, count))
		goto out;

	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;

	page_count = nfs_get_user_pages(WRITE, (unsigned long) buf,
						count, &pages);
	if (page_count < 0) {
		nfs_free_user_pages(pages, 0, 0);
		retval = page_count;
		goto out;
	}

	retval = nfs_direct_write(iocb, (unsigned long) buf, count,
					pos, pages, page_count);

	/*
	 * XXX: nfs_end_data_update() already ensures this file's
	 *      cached data is subsequently invalidated.  Do we really
	 *      need to call invalidate_inode_pages2() again here?
	 *
	 *      For aio writes, this invalidation will almost certainly
	 *      occur before the writes complete.  Kind of racy.
	 */
	if (mapping->nrpages)
		invalidate_inode_pages2(mapping);
L
888
		iocb->ki_pos = pos + retval;
L
out:
	return retval;
}

/**
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
int nfs_init_directcache(void)
{
	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
						sizeof(struct nfs_direct_req),
						0, SLAB_RECLAIM_ACCOUNT,
						NULL, NULL);
	if (nfs_direct_cachep == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
 *
 */
void nfs_destroy_directcache(void)
{
	if (kmem_cache_destroy(nfs_direct_cachep))
		printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n");
}