/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 * 15 Sep 2004	Parallel async reads  --cel
 * 04 May 2005	support O_DIRECT with aio  --cel
 *
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/smp_lock.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/atomic.h>

C
Chuck Lever 已提交
58 59
#include "iostat.h"

L
Linus Torvalds 已提交
60 61 62 63 64 65 66 67 68
#define NFSDBG_FACILITY		NFSDBG_VFS

static kmem_cache_t *nfs_direct_cachep;

/*
 * This represents a set of asynchronous requests that we're waiting on
 */
struct nfs_direct_req {
	struct kref		kref;		/* release manager */
69 70

	/* I/O parameters */
T
Trond Myklebust 已提交
71 72
	struct list_head	list,		/* nfs_read/write_data structs */
				rewrite_list;	/* saved nfs_write_data structs */
73
	struct nfs_open_context	*ctx;		/* file open context info */
74
	struct kiocb *		iocb;		/* controlling i/o request */
75
	struct inode *		inode;		/* target file of i/o */
T
Trond Myklebust 已提交
76 77 78 79 80
	unsigned long		user_addr;	/* location of user's buffer */
	size_t			user_count;	/* total bytes to move */
	loff_t			pos;		/* starting offset in file */
	struct page **		pages;		/* pages in our buffer */
	unsigned int		npages;		/* count of pages */
81 82 83

	/* completion state */
	spinlock_t		lock;		/* protect completion state */
T
Trond Myklebust 已提交
84
	int			outstanding;	/* i/os we're waiting for */
85
	ssize_t			count,		/* bytes actually processed */
L
Linus Torvalds 已提交
86
				error;		/* any reported error */
87
	struct completion	completion;	/* wait for i/o completion */
88 89 90 91 92 93 94

	/* commit state */
	struct nfs_write_data *	commit_data;	/* special write_data for commits */
	int			flags;
#define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
	struct nfs_writeverf	verf;		/* unstable write verifier */
L
Linus Torvalds 已提交
95 96
};

T
Trond Myklebust 已提交
97
static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync);
98
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
99

L
Linus Torvalds 已提交
100
/**
101 102 103 104 105 106 107 108 109 110 111
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @rw: direction (read or write)
 * @iocb: target I/O control block
 * @iov: array of vectors that define I/O buffer
 * @pos: offset in file to begin the operation
 * @nr_segs: size of iovec array
 *
 * The presence of this routine in the address space ops vector means
 * the NFS client supports direct I/O.  However, we shunt off direct
 * read and write requests before the VFS gets them, so this method
 * should never be called.
L
Linus Torvalds 已提交
112
 */
113 114 115
ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
{
	dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
116 117
			iocb->ki_filp->f_dentry->d_name.name,
			(long long) pos, nr_segs);
118 119 120 121

	return -EINVAL;
}

T
Trond Myklebust 已提交
122
/*
 * Release the first @npages entries of a pinned page array and then
 * free the array itself.  When @do_dirty is non-zero, each page that is
 * not a compound page is marked dirty before its reference is dropped.
 */
static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
{
	int idx = 0;

	while (idx < npages) {
		struct page *pg = pages[idx++];

		if (do_dirty && !PageCompound(pg))
			set_page_dirty_lock(pg);
		page_cache_release(pg);
	}
	kfree(pages);
}

T
Trond Myklebust 已提交
134
/*
 * Pin the user pages backing [user_addr, user_addr + size) and hand back
 * a freshly allocated array of them through @pages.
 *
 * Returns the number of pages pinned on success.  On failure *pages is
 * left NULL (or never set to a valid array) and the return value is
 * -ENOMEM if the array could not be allocated, -EFAULT if fewer pages
 * than requested could be pinned, or the negative value reported by
 * get_user_pages().
 */
static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages)
{
	unsigned long nr_wanted;
	int pinned;

	/* number of pages touched by the buffer, counting partial ones */
	nr_wanted = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	nr_wanted -= user_addr >> PAGE_SHIFT;

	*pages = kmalloc(nr_wanted * sizeof(struct page *), GFP_KERNEL);
	if (!*pages)
		return -ENOMEM;

	down_read(&current->mm->mmap_sem);
	/* a file READ stores into user memory, so ask for write access */
	pinned = get_user_pages(current, current->mm, user_addr,
				nr_wanted, (rw == READ), 0,
				*pages, NULL);
	up_read(&current->mm->mmap_sem);

	if (pinned == nr_wanted)
		return pinned;

	if (pinned < 0) {
		/* nothing was pinned; only the array needs freeing */
		kfree(*pages);
	} else {
		/*
		 * Fewer pages than expected: the user buffer runs off
		 * the end of a mapping.  Unpin what we got and report
		 * EFAULT.
		 */
		nfs_free_user_pages(*pages, pinned, 0);
		pinned = -EFAULT;
	}
	*pages = NULL;
	return pinned;
}

168
static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
L
Linus Torvalds 已提交
169
{
170 171 172 173 174 175 176
	struct nfs_direct_req *dreq;

	dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL);
	if (!dreq)
		return NULL;

	kref_init(&dreq->kref);
177
	init_completion(&dreq->completion);
T
Trond Myklebust 已提交
178
	INIT_LIST_HEAD(&dreq->list);
179
	INIT_LIST_HEAD(&dreq->rewrite_list);
180
	dreq->iocb = NULL;
181
	dreq->ctx = NULL;
182
	spin_lock_init(&dreq->lock);
T
Trond Myklebust 已提交
183
	dreq->outstanding = 0;
184 185
	dreq->count = 0;
	dreq->error = 0;
186
	dreq->flags = 0;
187 188

	return dreq;
L
Linus Torvalds 已提交
189 190 191 192 193
}

static void nfs_direct_req_release(struct kref *kref)
{
	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
194 195 196

	if (dreq->ctx != NULL)
		put_nfs_open_context(dreq->ctx);
L
Linus Torvalds 已提交
197 198 199
	kmem_cache_free(nfs_direct_cachep, dreq);
}

200 201 202 203 204
/*
 * Collects and returns the final error value/byte-count.
 */
static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
205
	ssize_t result = -EIOCBQUEUED;
206 207 208 209 210

	/* Async requests don't wait here */
	if (dreq->iocb)
		goto out;

211
	result = wait_for_completion_interruptible(&dreq->completion);
212 213

	if (!result)
214
		result = dreq->error;
215
	if (!result)
216
		result = dreq->count;
217 218 219 220 221 222

out:
	kref_put(&dreq->kref, nfs_direct_req_release);
	return (ssize_t) result;
}

223
/*
T
Trond Myklebust 已提交
224 225 226 227 228 229 230
 * We must hold a reference to all the pages in this direct read request
 * until the RPCs complete.  This could be long *after* we are woken up in
 * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
 *
 * In addition, synchronous I/O uses a stack-allocated iocb.  Thus we
 * can't trust the iocb is still valid here if this is a synchronous
 * request.  If the waiter is woken prematurely, the iocb is long gone.
231 232 233
 */
static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
T
Trond Myklebust 已提交
234 235
	nfs_free_user_pages(dreq->pages, dreq->npages, 1);

236
	if (dreq->iocb) {
237
		long res = (long) dreq->error;
238
		if (!res)
239
			res = (long) dreq->count;
240
		aio_complete(dreq->iocb, res, 0);
241 242
	}
	complete_all(&dreq->completion);
243 244 245 246

	kref_put(&dreq->kref, nfs_direct_req_release);
}

247
/*
T
Trond Myklebust 已提交
248 249 250
 * Note we also set the number of requests we have in the dreq when we are
 * done.  This prevents races with I/O completion so we will always wait
 * until all requests have been dispatched and completed.
251
 */
T
Trond Myklebust 已提交
252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289
static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize)
{
	struct list_head *list;
	struct nfs_direct_req *dreq;
	unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	dreq = nfs_direct_req_alloc();
	if (!dreq)
		return NULL;

	list = &dreq->list;
	for(;;) {
		struct nfs_read_data *data = nfs_readdata_alloc(rpages);

		if (unlikely(!data)) {
			while (!list_empty(list)) {
				data = list_entry(list->next,
						  struct nfs_read_data, pages);
				list_del(&data->pages);
				nfs_readdata_free(data);
			}
			kref_put(&dreq->kref, nfs_direct_req_release);
			return NULL;
		}

		INIT_LIST_HEAD(&data->pages);
		list_add(&data->pages, list);

		data->req = (struct nfs_page *) dreq;
		dreq->outstanding++;
		if (nbytes <= rsize)
			break;
		nbytes -= rsize;
	}
	kref_get(&dreq->kref);
	return dreq;
}

T
Trond Myklebust 已提交
290
static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
L
Linus Torvalds 已提交
291
{
T
Trond Myklebust 已提交
292
	struct nfs_read_data *data = calldata;
L
Linus Torvalds 已提交
293 294
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

T
Trond Myklebust 已提交
295 296
	if (nfs_readpage_result(task, data) != 0)
		return;
297 298 299

	spin_lock(&dreq->lock);

T
Trond Myklebust 已提交
300
	if (likely(task->tk_status >= 0))
301
		dreq->count += data->res.count;
L
Linus Torvalds 已提交
302
	else
303
		dreq->error = task->tk_status;
L
Linus Torvalds 已提交
304

T
Trond Myklebust 已提交
305 306 307 308
	if (--dreq->outstanding) {
		spin_unlock(&dreq->lock);
		return;
	}
309

T
Trond Myklebust 已提交
310 311
	spin_unlock(&dreq->lock);
	nfs_direct_complete(dreq);
L
Linus Torvalds 已提交
312 313
}

T
Trond Myklebust 已提交
314 315 316 317 318
/* RPC callbacks attached to every direct READ task */
static const struct rpc_call_ops nfs_read_direct_ops = {
	.rpc_release	= nfs_readdata_release,
	.rpc_call_done	= nfs_direct_read_result,
};

319
/*
T
Trond Myklebust 已提交
320 321
 * For each nfs_read_data struct that was allocated on the list, dispatch
 * an NFS READ operation
L
Linus Torvalds 已提交
322
 */
T
Trond Myklebust 已提交
323
static void nfs_direct_read_schedule(struct nfs_direct_req *dreq)
L
Linus Torvalds 已提交
324
{
325 326
	struct nfs_open_context *ctx = dreq->ctx;
	struct inode *inode = ctx->dentry->d_inode;
T
Trond Myklebust 已提交
327 328 329 330
	struct list_head *list = &dreq->list;
	struct page **pages = dreq->pages;
	size_t count = dreq->user_count;
	loff_t pos = dreq->pos;
331
	size_t rsize = NFS_SERVER(inode)->rsize;
T
Trond Myklebust 已提交
332
	unsigned int curpage, pgbase;
L
Linus Torvalds 已提交
333

T
Trond Myklebust 已提交
334 335
	curpage = 0;
	pgbase = dreq->user_addr & ~PAGE_MASK;
L
Linus Torvalds 已提交
336
	do {
337
		struct nfs_read_data *data;
338
		size_t bytes;
L
Linus Torvalds 已提交
339 340 341 342 343

		bytes = rsize;
		if (count < rsize)
			bytes = count;

T
Trond Myklebust 已提交
344 345 346
		BUG_ON(list_empty(list));
		data = list_entry(list->next, struct nfs_read_data, pages);
		list_del_init(&data->pages);
347

L
Linus Torvalds 已提交
348 349 350 351
		data->inode = inode;
		data->cred = ctx->cred;
		data->args.fh = NFS_FH(inode);
		data->args.context = ctx;
352
		data->args.offset = pos;
L
Linus Torvalds 已提交
353
		data->args.pgbase = pgbase;
T
Trond Myklebust 已提交
354
		data->args.pages = &pages[curpage];
L
Linus Torvalds 已提交
355 356 357 358 359
		data->args.count = bytes;
		data->res.fattr = &data->fattr;
		data->res.eof = 0;
		data->res.count = bytes;

T
Trond Myklebust 已提交
360 361
		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
				&nfs_read_direct_ops, data);
L
Linus Torvalds 已提交
362 363 364 365 366 367 368 369
		NFS_PROTO(inode)->read_setup(data);

		data->task.tk_cookie = (unsigned long) inode;

		lock_kernel();
		rpc_execute(&data->task);
		unlock_kernel();

370
		dfprintk(VFS, "NFS: %5u initiated direct read call (req %s/%Ld, %zu bytes @ offset %Lu)\n",
L
Linus Torvalds 已提交
371 372 373 374 375 376
				data->task.tk_pid,
				inode->i_sb->s_id,
				(long long)NFS_FILEID(inode),
				bytes,
				(unsigned long long)data->args.offset);

377
		pos += bytes;
L
Linus Torvalds 已提交
378
		pgbase += bytes;
T
Trond Myklebust 已提交
379
		curpage += pgbase >> PAGE_SHIFT;
L
Linus Torvalds 已提交
380 381 382 383
		pgbase &= ~PAGE_MASK;

		count -= bytes;
	} while (count != 0);
T
Trond Myklebust 已提交
384
	BUG_ON(!list_empty(list));
L
Linus Torvalds 已提交
385 386
}

T
Trond Myklebust 已提交
387
/*
 * Set up and run one direct read.
 *
 * @iocb:      controlling I/O request; recorded in the dreq only when it
 *             is not a synchronous kiocb
 * @user_addr: address of the user's buffer
 * @count:     number of bytes to read
 * @pos:       starting offset in the file
 * @pages:     array of pinned user pages backing the buffer
 * @nr_pages:  number of entries in @pages
 *
 * This function takes over @pages: on the normal path they are released
 * by nfs_direct_complete() once all READs have finished.  If the request
 * cannot be allocated, nothing else would ever release them, so they
 * are freed here before returning -ENOMEM.
 *
 * Returns whatever nfs_direct_wait() reports (byte count, error, or
 * -EIOCBQUEUED for async requests), or -ENOMEM.
 */
static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, unsigned int nr_pages)
{
	ssize_t result;
	sigset_t oldset;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct rpc_clnt *clnt = NFS_CLIENT(inode);
	struct nfs_direct_req *dreq;

	dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize);
	if (!dreq) {
		/* no data was read into the pages, so don't dirty them */
		nfs_free_user_pages(pages, nr_pages, 0);
		return -ENOMEM;
	}

	dreq->user_addr = user_addr;
	dreq->user_count = count;
	dreq->pos = pos;
	dreq->pages = pages;
	dreq->npages = nr_pages;
	dreq->inode = inode;
	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
	rpc_clnt_sigmask(clnt, &oldset);
	nfs_direct_read_schedule(dreq);
	result = nfs_direct_wait(dreq);
	rpc_clnt_sigunmask(clnt, &oldset);

	return result;
}

418
static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
L
Linus Torvalds 已提交
419
{
T
Trond Myklebust 已提交
420 421 422
	list_splice_init(&dreq->rewrite_list, &dreq->list);
	while (!list_empty(&dreq->list)) {
		struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
423 424 425 426
		list_del(&data->pages);
		nfs_writedata_release(data);
	}
}
L
Linus Torvalds 已提交
427

428 429 430
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
T
Trond Myklebust 已提交
431
	struct list_head *pos;
L
Linus Torvalds 已提交
432

T
Trond Myklebust 已提交
433 434 435
	list_splice_init(&dreq->rewrite_list, &dreq->list);
	list_for_each(pos, &dreq->list)
		dreq->outstanding++;
436
	dreq->count = 0;
437

T
Trond Myklebust 已提交
438
	nfs_direct_write_schedule(dreq, FLUSH_STABLE);
439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455
}

/*
 * rpc_call_done handler for the COMMIT issued after unstable writes.
 * A failed COMMIT, or a verifier that differs from the one saved from
 * the writes, flags the request so that its writes are rescheduled;
 * either way control then passes to nfs_direct_write_complete().
 */
static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
{
	struct nfs_write_data *cdata = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) cdata->req;

	/* Call the NFS version-specific code */
	if (NFS_PROTO(cdata->inode)->commit_done(task, cdata) != 0)
		return;

	if (unlikely(task->tk_status < 0)) {
		dreq->error = task->tk_status;
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	}

	if (memcmp(&dreq->verf, &cdata->verf, sizeof(cdata->verf)) != 0) {
		dprintk("NFS: %5u commit verify failed\n", task->tk_pid);
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	}

	dprintk("NFS: %5u commit returned %d\n", task->tk_pid, task->tk_status);
	nfs_direct_write_complete(dreq, cdata->inode);
}

462 463 464 465 466 467
/* RPC callbacks attached to the direct-write COMMIT task */
static const struct rpc_call_ops nfs_commit_direct_ops = {
	.rpc_release	= nfs_commit_release,
	.rpc_call_done	= nfs_direct_commit_result,
};

static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
L
Linus Torvalds 已提交
468
{
469
	struct nfs_write_data *data = dreq->commit_data;
L
Linus Torvalds 已提交
470

471
	data->inode = dreq->inode;
472
	data->cred = dreq->ctx->cred;
L
Linus Torvalds 已提交
473

474
	data->args.fh = NFS_FH(data->inode);
T
Trond Myklebust 已提交
475 476
	data->args.offset = dreq->pos;
	data->args.count = dreq->user_count;
477 478 479
	data->res.count = 0;
	data->res.fattr = &data->fattr;
	data->res.verf = &data->verf;
L
Linus Torvalds 已提交
480

481 482 483
	rpc_init_task(&data->task, NFS_CLIENT(dreq->inode), RPC_TASK_ASYNC,
				&nfs_commit_direct_ops, data);
	NFS_PROTO(data->inode)->commit_setup(data, 0);
L
Linus Torvalds 已提交
484

485 486 487 488
	data->task.tk_priority = RPC_PRIORITY_NORMAL;
	data->task.tk_cookie = (unsigned long)data->inode;
	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
	dreq->commit_data = NULL;
L
Linus Torvalds 已提交
489

490
	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
L
Linus Torvalds 已提交
491

492 493 494 495
	lock_kernel();
	rpc_execute(&data->task);
	unlock_kernel();
}
L
Linus Torvalds 已提交
496

497 498 499
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	int flags = dreq->flags;
L
Linus Torvalds 已提交
500

501 502 503 504
	dreq->flags = 0;
	switch (flags) {
		case NFS_ODIRECT_DO_COMMIT:
			nfs_direct_commit_schedule(dreq);
L
Linus Torvalds 已提交
505
			break;
506 507 508 509 510 511 512 513 514 515 516
		case NFS_ODIRECT_RESCHED_WRITES:
			nfs_direct_write_reschedule(dreq);
			break;
		default:
			nfs_end_data_update(inode);
			if (dreq->commit_data != NULL)
				nfs_commit_free(dreq->commit_data);
			nfs_direct_free_writedata(dreq);
			nfs_direct_complete(dreq);
	}
}
L
Linus Torvalds 已提交
517

518 519 520 521 522 523 524 525 526 527 528
static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
	dreq->commit_data = nfs_commit_alloc(0);
	if (dreq->commit_data != NULL)
		dreq->commit_data->req = (struct nfs_page *) dreq;
}
#else
/*
 * Variant built when neither CONFIG_NFS_V3 nor CONFIG_NFS_V4 is defined:
 * no commit data is allocated, so dreq->commit_data is always NULL.
 */
static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
	dreq->commit_data = NULL;
}
L
Linus Torvalds 已提交
529

530 531 532 533 534 535 536
/*
 * Variant built when neither CONFIG_NFS_V3 nor CONFIG_NFS_V4 is defined:
 * there is no commit or resend step, so the request is finished
 * unconditionally and its write data released.
 */
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	nfs_end_data_update(inode);
	nfs_direct_free_writedata(dreq);
	nfs_direct_complete(dreq);
}
#endif
L
Linus Torvalds 已提交
537

T
Trond Myklebust 已提交
538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578
/*
 * Allocate a direct request plus one nfs_write_data for every
 * wsize-sized piece of an nbytes-long write, queueing them on
 * dreq->list and counting them in dreq->outstanding.  Commit data is
 * set up as well (it may end up NULL).
 *
 * On success the request is returned with an extra reference taken.
 * Returns NULL, with everything freed again, if a write_data or the
 * request itself cannot be allocated.
 */
static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
{
	struct nfs_direct_req *dreq;
	struct nfs_write_data *data;
	struct list_head *head;
	unsigned int wpages = (wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	dreq = nfs_direct_req_alloc();
	if (dreq == NULL)
		return NULL;

	head = &dreq->list;
	for (;;) {
		data = nfs_writedata_alloc(wpages);
		if (unlikely(data == NULL))
			goto out_unwind;

		INIT_LIST_HEAD(&data->pages);
		list_add(&data->pages, head);

		data->req = (struct nfs_page *) dreq;
		dreq->outstanding++;

		if (nbytes <= wsize)
			break;
		nbytes -= wsize;
	}

	nfs_alloc_commit_data(dreq);

	kref_get(&dreq->kref);
	return dreq;

out_unwind:
	/* give back every write_data queued so far, then the request */
	while (!list_empty(head)) {
		data = list_entry(head->next, struct nfs_write_data, pages);
		list_del(&data->pages);
		nfs_writedata_free(data);
	}
	kref_put(&dreq->kref, nfs_direct_req_release);
	return NULL;
}

579
static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
L
Linus Torvalds 已提交
580
{
581 582 583 584 585 586 587
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
	int status = task->tk_status;

	if (nfs_writeback_done(task, data) != 0)
		return;

588
	spin_lock(&dreq->lock);
L
Linus Torvalds 已提交
589

590
	if (likely(status >= 0))
591
		dreq->count += data->res.count;
592
	else
593
		dreq->error = task->tk_status;
L
Linus Torvalds 已提交
594

595 596 597 598 599
	if (data->res.verf->committed != NFS_FILE_SYNC) {
		switch (dreq->flags) {
			case 0:
				memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf));
				dreq->flags = NFS_ODIRECT_DO_COMMIT;
L
Linus Torvalds 已提交
600
				break;
601 602 603 604 605
			case NFS_ODIRECT_DO_COMMIT:
				if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) {
					dprintk("NFS: %5u write verify failed\n", task->tk_pid);
					dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
				}
L
Linus Torvalds 已提交
606 607
		}
	}
T
Trond Myklebust 已提交
608 609
	/* In case we have to resend */
	data->args.stable = NFS_FILE_SYNC;
610 611

	spin_unlock(&dreq->lock);
L
Linus Torvalds 已提交
612 613
}

614 615 616
/*
 * NB: Return the value of the first error return code.  Subsequent
 *     errors after the first one are ignored.
L
Linus Torvalds 已提交
617
 */
618
static void nfs_direct_write_release(void *calldata)
L
Linus Torvalds 已提交
619
{
620 621
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
L
Linus Torvalds 已提交
622

T
Trond Myklebust 已提交
623 624 625 626 627 628 629 630
	spin_lock(&dreq->lock);
	if (--dreq->outstanding) {
		spin_unlock(&dreq->lock);
		return;
	}
	spin_unlock(&dreq->lock);

	nfs_direct_write_complete(dreq, data->inode);
631 632 633 634
}

static const struct rpc_call_ops nfs_write_direct_ops = {
	.rpc_call_done = nfs_direct_write_result,
635
	.rpc_release = nfs_direct_write_release,
636 637 638
};

/*
T
Trond Myklebust 已提交
639 640
 * For each nfs_write_data struct that was allocated on the list, dispatch
 * an NFS WRITE operation
641
 */
T
Trond Myklebust 已提交
642
static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync)
643
{
644 645
	struct nfs_open_context *ctx = dreq->ctx;
	struct inode *inode = ctx->dentry->d_inode;
T
Trond Myklebust 已提交
646 647 648 649
	struct list_head *list = &dreq->list;
	struct page **pages = dreq->pages;
	size_t count = dreq->user_count;
	loff_t pos = dreq->pos;
650
	size_t wsize = NFS_SERVER(inode)->wsize;
T
Trond Myklebust 已提交
651
	unsigned int curpage, pgbase;
652

T
Trond Myklebust 已提交
653 654
	curpage = 0;
	pgbase = dreq->user_addr & ~PAGE_MASK;
L
Linus Torvalds 已提交
655
	do {
656
		struct nfs_write_data *data;
657 658 659 660 661 662
		size_t bytes;

		bytes = wsize;
		if (count < wsize)
			bytes = count;

T
Trond Myklebust 已提交
663 664
		BUG_ON(list_empty(list));
		data = list_entry(list->next, struct nfs_write_data, pages);
665
		list_move_tail(&data->pages, &dreq->rewrite_list);
666 667 668 669 670

		data->inode = inode;
		data->cred = ctx->cred;
		data->args.fh = NFS_FH(inode);
		data->args.context = ctx;
671
		data->args.offset = pos;
672
		data->args.pgbase = pgbase;
T
Trond Myklebust 已提交
673
		data->args.pages = &pages[curpage];
674 675 676
		data->args.count = bytes;
		data->res.fattr = &data->fattr;
		data->res.count = bytes;
677
		data->res.verf = &data->verf;
678 679 680

		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
				&nfs_write_direct_ops, data);
681
		NFS_PROTO(inode)->write_setup(data, sync);
L
Linus Torvalds 已提交
682

683 684
		data->task.tk_priority = RPC_PRIORITY_NORMAL;
		data->task.tk_cookie = (unsigned long) inode;
L
Linus Torvalds 已提交
685 686

		lock_kernel();
687
		rpc_execute(&data->task);
L
Linus Torvalds 已提交
688 689
		unlock_kernel();

690
		dfprintk(VFS, "NFS: %5u initiated direct write call (req %s/%Ld, %zu bytes @ offset %Lu)\n",
691 692 693 694 695
				data->task.tk_pid,
				inode->i_sb->s_id,
				(long long)NFS_FILEID(inode),
				bytes,
				(unsigned long long)data->args.offset);
L
Linus Torvalds 已提交
696

697
		pos += bytes;
698
		pgbase += bytes;
T
Trond Myklebust 已提交
699
		curpage += pgbase >> PAGE_SHIFT;
700
		pgbase &= ~PAGE_MASK;
L
Linus Torvalds 已提交
701

702 703
		count -= bytes;
	} while (count != 0);
T
Trond Myklebust 已提交
704
	BUG_ON(!list_empty(list));
705
}
L
Linus Torvalds 已提交
706

T
Trond Myklebust 已提交
707
/*
 * Set up and run one direct write.
 *
 * @iocb:      controlling I/O request; recorded in the dreq only when it
 *             is not a synchronous kiocb
 * @user_addr: address of the user's buffer
 * @count:     number of bytes to write
 * @pos:       starting offset in the file
 * @pages:     array of pinned user pages backing the buffer
 * @nr_pages:  number of entries in @pages
 *
 * This function takes over @pages: on the normal path they are released
 * by nfs_direct_complete() once the request has finished.  If the
 * request cannot be allocated, nothing else would ever release them, so
 * they are freed here before returning -ENOMEM.
 *
 * Writes are sent with FLUSH_STABLE when no commit data could be
 * allocated or when the whole request is smaller than wsize.
 *
 * Returns whatever nfs_direct_wait() reports (byte count, error, or
 * -EIOCBQUEUED for async requests), or -ENOMEM.
 */
static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, int nr_pages)
{
	ssize_t result;
	sigset_t oldset;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct rpc_clnt *clnt = NFS_CLIENT(inode);
	struct nfs_direct_req *dreq;
	size_t wsize = NFS_SERVER(inode)->wsize;
	int sync = 0;

	dreq = nfs_direct_write_alloc(count, wsize);
	if (!dreq) {
		/* the pages were only going to be read from: don't dirty */
		nfs_free_user_pages(pages, nr_pages, 0);
		return -ENOMEM;
	}
	if (dreq->commit_data == NULL || count < wsize)
		sync = FLUSH_STABLE;

	dreq->user_addr = user_addr;
	dreq->user_count = count;
	dreq->pos = pos;
	dreq->pages = pages;
	dreq->npages = nr_pages;
	dreq->inode = inode;
	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count);

	nfs_begin_data_update(inode);

	rpc_clnt_sigmask(clnt, &oldset);
	nfs_direct_write_schedule(dreq, sync);
	result = nfs_direct_wait(dreq);
	rpc_clnt_sigunmask(clnt, &oldset);

	return result;
}

/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
 * @buf: user's buffer into which to read data
749 750
 * @count: number of bytes to read
 * @pos: byte offset in file where reading starts
L
Linus Torvalds 已提交
751 752 753 754 755 756
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid gfar's check to see if
 * the request starts before the end of the file.  For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
757
 * READ where the file size could change.  Our preference is simply
L
Linus Torvalds 已提交
758 759
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
760
 *
L
Linus Torvalds 已提交
761 762 763 764 765
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
766
ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
L
Linus Torvalds 已提交
767 768
{
	ssize_t retval = -EINVAL;
T
Trond Myklebust 已提交
769 770
	int page_count;
	struct page **pages;
L
Linus Torvalds 已提交
771 772 773
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;

774
	dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n",
775 776
		file->f_dentry->d_parent->d_name.name,
		file->f_dentry->d_name.name,
777
		(unsigned long) count, (long long) pos);
L
Linus Torvalds 已提交
778 779 780 781

	if (count < 0)
		goto out;
	retval = -EFAULT;
782
	if (!access_ok(VERIFY_WRITE, buf, count))
L
Linus Torvalds 已提交
783 784 785 786 787
		goto out;
	retval = 0;
	if (!count)
		goto out;

T
Trond Myklebust 已提交
788 789 790
	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;
L
Linus Torvalds 已提交
791

T
Trond Myklebust 已提交
792 793 794 795 796 797 798 799
	retval = nfs_get_user_pages(READ, (unsigned long) buf,
						count, &pages);
	if (retval < 0)
		goto out;
	page_count = retval;

	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos,
						pages, page_count);
L
Linus Torvalds 已提交
800
	if (retval > 0)
801
		iocb->ki_pos = pos + retval;
L
Linus Torvalds 已提交
802 803 804 805 806 807 808 809 810

out:
	return retval;
}

/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
 * @buf: user's buffer from which to write data
811 812
 * @count: number of bytes to write
 * @pos: byte offset in file where writing starts
L
Linus Torvalds 已提交
813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size.  The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache.  We let the server do generic write
 * parameter checking and report problems.
 *
 * We also avoid an unnecessary invocation of generic_osync_inode(),
 * as it is fairly meaningless to sync the metadata of an NFS file.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
832
ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
L
Linus Torvalds 已提交
833
{
834
	ssize_t retval;
T
Trond Myklebust 已提交
835 836
	int page_count;
	struct page **pages;
L
Linus Torvalds 已提交
837 838 839
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;

840
	dfprintk(VFS, "nfs: direct write(%s/%s, %lu@%Ld)\n",
841
		file->f_dentry->d_parent->d_name.name,
842 843
		file->f_dentry->d_name.name,
		(unsigned long) count, (long long) pos);
L
Linus Torvalds 已提交
844

845 846
	retval = generic_write_checks(file, &pos, &count, 0);
	if (retval)
L
Linus Torvalds 已提交
847
		goto out;
848 849 850

	retval = -EINVAL;
	if ((ssize_t) count < 0)
L
Linus Torvalds 已提交
851 852 853 854
		goto out;
	retval = 0;
	if (!count)
		goto out;
855 856

	retval = -EFAULT;
857
	if (!access_ok(VERIFY_READ, buf, count))
858
		goto out;
L
Linus Torvalds 已提交
859

T
Trond Myklebust 已提交
860 861 862
	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;
L
Linus Torvalds 已提交
863

T
Trond Myklebust 已提交
864 865 866 867 868 869 870 871
	retval = nfs_get_user_pages(WRITE, (unsigned long) buf,
						count, &pages);
	if (retval < 0)
		goto out;
	page_count = retval;

	retval = nfs_direct_write(iocb, (unsigned long) buf, count,
					pos, pages, page_count);
872 873 874 875 876 877 878 879 880

	/*
	 * XXX: nfs_end_data_update() already ensures this file's
	 *      cached data is subsequently invalidated.  Do we really
	 *      need to call invalidate_inode_pages2() again here?
	 *
	 *      For aio writes, this invalidation will almost certainly
	 *      occur before the writes complete.  Kind of racey.
	 */
L
Linus Torvalds 已提交
881 882
	if (mapping->nrpages)
		invalidate_inode_pages2(mapping);
883

L
Linus Torvalds 已提交
884
	if (retval > 0)
885
		iocb->ki_pos = pos + retval;
L
Linus Torvalds 已提交
886 887 888 889 890

out:
	return retval;
}

891 892 893 894
/**
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
D
David Howells 已提交
895
int __init nfs_init_directcache(void)
L
Linus Torvalds 已提交
896 897 898
{
	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
						sizeof(struct nfs_direct_req),
899 900
						0, (SLAB_RECLAIM_ACCOUNT|
							SLAB_MEM_SPREAD),
L
Linus Torvalds 已提交
901 902 903 904 905 906 907
						NULL, NULL);
	if (nfs_direct_cachep == NULL)
		return -ENOMEM;

	return 0;
}

908
/**
D
David Howells 已提交
909
 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
910 911
 *
 */
D
David Howells 已提交
912
void __exit nfs_destroy_directcache(void)
L
Linus Torvalds 已提交
913 914 915 916
{
	if (kmem_cache_destroy(nfs_direct_cachep))
		printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n");
}