direct.c 24.8 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9
/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 * 15 Sep 2004	Parallel async reads  --cel
 * 04 May 2005	support O_DIRECT with aio  --cel
 *
 */

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/smp_lock.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/atomic.h>

C
Chuck Lever 已提交
57 58
#include "iostat.h"

L
Linus Torvalds 已提交
59 60 61 62 63 64 65 66 67
#define NFSDBG_FACILITY		NFSDBG_VFS

static kmem_cache_t *nfs_direct_cachep;

/*
 * This represents a set of asynchronous requests that we're waiting on
 */
struct nfs_direct_req {
	struct kref		kref;		/* release manager */
68 69

	/* I/O parameters */
T
Trond Myklebust 已提交
70 71
	struct list_head	list,		/* nfs_read/write_data structs */
				rewrite_list;	/* saved nfs_write_data structs */
72
	struct nfs_open_context	*ctx;		/* file open context info */
73
	struct kiocb *		iocb;		/* controlling i/o request */
74
	struct inode *		inode;		/* target file of i/o */
T
Trond Myklebust 已提交
75 76 77 78 79
	unsigned long		user_addr;	/* location of user's buffer */
	size_t			user_count;	/* total bytes to move */
	loff_t			pos;		/* starting offset in file */
	struct page **		pages;		/* pages in our buffer */
	unsigned int		npages;		/* count of pages */
80 81 82

	/* completion state */
	spinlock_t		lock;		/* protect completion state */
T
Trond Myklebust 已提交
83
	int			outstanding;	/* i/os we're waiting for */
84
	ssize_t			count,		/* bytes actually processed */
L
Linus Torvalds 已提交
85
				error;		/* any reported error */
86
	struct completion	completion;	/* wait for i/o completion */
87 88 89 90 91 92 93

	/* commit state */
	struct nfs_write_data *	commit_data;	/* special write_data for commits */
	int			flags;
#define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
	struct nfs_writeverf	verf;		/* unstable write verifier */
L
Linus Torvalds 已提交
94 95
};

T
Trond Myklebust 已提交
96
/* Forward declarations: both are called before their definitions below. */
static void nfs_direct_write_schedule(struct nfs_direct_req *dreq,
				      int sync);
static void nfs_direct_write_complete(struct nfs_direct_req *dreq,
				      struct inode *inode);
98

L
Linus Torvalds 已提交
99
/**
100 101 102 103 104 105 106 107 108 109 110
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @rw: direction (read or write)
 * @iocb: target I/O control block
 * @iov: array of vectors that define I/O buffer
 * @pos: offset in file to begin the operation
 * @nr_segs: size of iovec array
 *
 * The presence of this routine in the address space ops vector means
 * the NFS client supports direct I/O.  However, we shunt off direct
 * read and write requests before the VFS gets them, so this method
 * should never be called.
L
Linus Torvalds 已提交
111
 */
112 113 114
ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
{
	dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
115 116
			iocb->ki_filp->f_dentry->d_name.name,
			(long long) pos, nr_segs);
117 118 119 120

	return -EINVAL;
}

T
Trond Myklebust 已提交
121
/*
 * Drop the reference held on each of the first @npages entries of @pages
 * and free the array itself.  When @do_dirty is non-zero, every page that
 * is not a compound page is marked dirty before its reference is dropped.
 */
static void nfs_free_user_pages(struct page **pages, int npages, int do_dirty)
{
	int idx = 0;

	while (idx < npages) {
		struct page *pg = pages[idx++];

		if (do_dirty && !PageCompound(pg))
			set_page_dirty_lock(pg);
		page_cache_release(pg);
	}

	kfree(pages);
}

T
Trond Myklebust 已提交
133
/*
 * Pin the user pages backing the buffer [user_addr, user_addr + size).
 *
 * On success the number of pinned pages is returned and *pages points to
 * a kmalloc'ed array holding them.  On failure *pages is set to NULL and
 * a negative errno is returned: -ENOMEM if the array could not be
 * allocated, -EFAULT if fewer pages than expected could be pinned, or
 * whatever negative value get_user_pages() reported.
 */
static inline int nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, struct page ***pages)
{
	unsigned long first_pfn = user_addr >> PAGE_SHIFT;
	unsigned long end_pfn = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long nr_wanted = end_pfn - first_pfn;
	struct page **vec;
	int got;

	vec = kmalloc(nr_wanted * sizeof(struct page *), GFP_KERNEL);
	*pages = vec;
	if (vec == NULL)
		return -ENOMEM;

	/* A READ from the server stores into user memory, hence write access */
	down_read(&current->mm->mmap_sem);
	got = get_user_pages(current, current->mm, user_addr,
			     nr_wanted, (rw == READ), 0,
			     vec, NULL);
	up_read(&current->mm->mmap_sem);

	if (got != nr_wanted) {
		if (got < 0) {
			/* Nothing was pinned; only the array needs freeing */
			kfree(vec);
		} else {
			/*
			 * A short count means the user buffer runs off the
			 * end of a mapping: unpin what we got and report
			 * EFAULT.
			 */
			nfs_free_user_pages(vec, got, 0);
			got = -EFAULT;
		}
		*pages = NULL;
	}

	return got;
}

167
static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
L
Linus Torvalds 已提交
168
{
169 170 171 172 173 174 175
	struct nfs_direct_req *dreq;

	dreq = kmem_cache_alloc(nfs_direct_cachep, SLAB_KERNEL);
	if (!dreq)
		return NULL;

	kref_init(&dreq->kref);
176
	init_completion(&dreq->completion);
T
Trond Myklebust 已提交
177
	INIT_LIST_HEAD(&dreq->list);
178
	INIT_LIST_HEAD(&dreq->rewrite_list);
179
	dreq->iocb = NULL;
180
	dreq->ctx = NULL;
181
	spin_lock_init(&dreq->lock);
T
Trond Myklebust 已提交
182
	dreq->outstanding = 0;
183 184
	dreq->count = 0;
	dreq->error = 0;
185
	dreq->flags = 0;
186 187

	return dreq;
L
Linus Torvalds 已提交
188 189 190 191 192
}

static void nfs_direct_req_release(struct kref *kref)
{
	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
193 194 195

	if (dreq->ctx != NULL)
		put_nfs_open_context(dreq->ctx);
L
Linus Torvalds 已提交
196 197 198
	kmem_cache_free(nfs_direct_cachep, dreq);
}

199 200 201 202 203
/*
 * Collects and returns the final error value/byte-count.
 */
static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
204
	ssize_t result = -EIOCBQUEUED;
205 206 207 208 209

	/* Async requests don't wait here */
	if (dreq->iocb)
		goto out;

210
	result = wait_for_completion_interruptible(&dreq->completion);
211 212

	if (!result)
213
		result = dreq->error;
214
	if (!result)
215
		result = dreq->count;
216 217 218 219 220 221

out:
	kref_put(&dreq->kref, nfs_direct_req_release);
	return (ssize_t) result;
}

222
/*
T
Trond Myklebust 已提交
223 224 225 226 227 228 229
 * We must hold a reference to all the pages in this direct read request
 * until the RPCs complete.  This could be long *after* we are woken up in
 * nfs_direct_wait (for instance, if someone hits ^C on a slow server).
 *
 * In addition, synchronous I/O uses a stack-allocated iocb.  Thus we
 * can't trust the iocb is still valid here if this is a synchronous
 * request.  If the waiter is woken prematurely, the iocb is long gone.
230 231 232
 */
static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
T
Trond Myklebust 已提交
233 234
	nfs_free_user_pages(dreq->pages, dreq->npages, 1);

235
	if (dreq->iocb) {
236
		long res = (long) dreq->error;
237
		if (!res)
238
			res = (long) dreq->count;
239
		aio_complete(dreq->iocb, res, 0);
240 241
	}
	complete_all(&dreq->completion);
242 243 244 245

	kref_put(&dreq->kref, nfs_direct_req_release);
}

246
/*
T
Trond Myklebust 已提交
247 248 249
 * Note we also set the number of requests we have in the dreq when we are
 * done.  This prevents races with I/O completion so we will always wait
 * until all requests have been dispatched and completed.
250
 */
T
Trond Myklebust 已提交
251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288
static struct nfs_direct_req *nfs_direct_read_alloc(size_t nbytes, size_t rsize)
{
	struct list_head *list;
	struct nfs_direct_req *dreq;
	unsigned int rpages = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	dreq = nfs_direct_req_alloc();
	if (!dreq)
		return NULL;

	list = &dreq->list;
	for(;;) {
		struct nfs_read_data *data = nfs_readdata_alloc(rpages);

		if (unlikely(!data)) {
			while (!list_empty(list)) {
				data = list_entry(list->next,
						  struct nfs_read_data, pages);
				list_del(&data->pages);
				nfs_readdata_free(data);
			}
			kref_put(&dreq->kref, nfs_direct_req_release);
			return NULL;
		}

		INIT_LIST_HEAD(&data->pages);
		list_add(&data->pages, list);

		data->req = (struct nfs_page *) dreq;
		dreq->outstanding++;
		if (nbytes <= rsize)
			break;
		nbytes -= rsize;
	}
	kref_get(&dreq->kref);
	return dreq;
}

T
Trond Myklebust 已提交
289
static void nfs_direct_read_result(struct rpc_task *task, void *calldata)
L
Linus Torvalds 已提交
290
{
T
Trond Myklebust 已提交
291
	struct nfs_read_data *data = calldata;
L
Linus Torvalds 已提交
292 293
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

T
Trond Myklebust 已提交
294 295
	if (nfs_readpage_result(task, data) != 0)
		return;
296 297 298

	spin_lock(&dreq->lock);

T
Trond Myklebust 已提交
299
	if (likely(task->tk_status >= 0))
300
		dreq->count += data->res.count;
L
Linus Torvalds 已提交
301
	else
302
		dreq->error = task->tk_status;
L
Linus Torvalds 已提交
303

T
Trond Myklebust 已提交
304 305 306 307
	if (--dreq->outstanding) {
		spin_unlock(&dreq->lock);
		return;
	}
308

T
Trond Myklebust 已提交
309 310
	spin_unlock(&dreq->lock);
	nfs_direct_complete(dreq);
L
Linus Torvalds 已提交
311 312
}

T
Trond Myklebust 已提交
313 314 315 316 317
/* RPC callbacks attached to every direct READ task */
static const struct rpc_call_ops nfs_read_direct_ops = {
	.rpc_release	= nfs_readdata_release,
	.rpc_call_done	= nfs_direct_read_result,
};

318
/*
T
Trond Myklebust 已提交
319 320
 * For each nfs_read_data struct that was allocated on the list, dispatch
 * an NFS READ operation
L
Linus Torvalds 已提交
321
 */
T
Trond Myklebust 已提交
322
static void nfs_direct_read_schedule(struct nfs_direct_req *dreq)
L
Linus Torvalds 已提交
323
{
324 325
	struct nfs_open_context *ctx = dreq->ctx;
	struct inode *inode = ctx->dentry->d_inode;
T
Trond Myklebust 已提交
326 327 328 329
	struct list_head *list = &dreq->list;
	struct page **pages = dreq->pages;
	size_t count = dreq->user_count;
	loff_t pos = dreq->pos;
330
	size_t rsize = NFS_SERVER(inode)->rsize;
T
Trond Myklebust 已提交
331
	unsigned int curpage, pgbase;
L
Linus Torvalds 已提交
332

T
Trond Myklebust 已提交
333 334
	curpage = 0;
	pgbase = dreq->user_addr & ~PAGE_MASK;
L
Linus Torvalds 已提交
335
	do {
336
		struct nfs_read_data *data;
337
		size_t bytes;
L
Linus Torvalds 已提交
338 339 340 341 342

		bytes = rsize;
		if (count < rsize)
			bytes = count;

T
Trond Myklebust 已提交
343 344 345
		BUG_ON(list_empty(list));
		data = list_entry(list->next, struct nfs_read_data, pages);
		list_del_init(&data->pages);
346

L
Linus Torvalds 已提交
347 348 349 350
		data->inode = inode;
		data->cred = ctx->cred;
		data->args.fh = NFS_FH(inode);
		data->args.context = ctx;
351
		data->args.offset = pos;
L
Linus Torvalds 已提交
352
		data->args.pgbase = pgbase;
T
Trond Myklebust 已提交
353
		data->args.pages = &pages[curpage];
L
Linus Torvalds 已提交
354 355 356 357 358
		data->args.count = bytes;
		data->res.fattr = &data->fattr;
		data->res.eof = 0;
		data->res.count = bytes;

T
Trond Myklebust 已提交
359 360
		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
				&nfs_read_direct_ops, data);
L
Linus Torvalds 已提交
361 362 363 364 365 366 367 368
		NFS_PROTO(inode)->read_setup(data);

		data->task.tk_cookie = (unsigned long) inode;

		lock_kernel();
		rpc_execute(&data->task);
		unlock_kernel();

369
		dfprintk(VFS, "NFS: %5u initiated direct read call (req %s/%Ld, %zu bytes @ offset %Lu)\n",
L
Linus Torvalds 已提交
370 371 372 373 374 375
				data->task.tk_pid,
				inode->i_sb->s_id,
				(long long)NFS_FILEID(inode),
				bytes,
				(unsigned long long)data->args.offset);

376
		pos += bytes;
L
Linus Torvalds 已提交
377
		pgbase += bytes;
T
Trond Myklebust 已提交
378
		curpage += pgbase >> PAGE_SHIFT;
L
Linus Torvalds 已提交
379 380 381 382
		pgbase &= ~PAGE_MASK;

		count -= bytes;
	} while (count != 0);
T
Trond Myklebust 已提交
383
	BUG_ON(!list_empty(list));
L
Linus Torvalds 已提交
384 385
}

T
Trond Myklebust 已提交
386
/*
 * Carry out a direct read of @count bytes at file offset @pos into the
 * user buffer at @user_addr, whose pages have already been pinned in
 * @pages (@nr_pages entries).
 *
 * This function takes over @pages.  On the normal path they are released
 * by nfs_direct_complete() when the last READ finishes.  If the request
 * cannot be allocated they are released here, because the caller does
 * not release them on failure.
 *
 * Returns the result of nfs_direct_wait(): a byte count, a negative
 * errno, or -EIOCBQUEUED for an async iocb.  Returns -ENOMEM if the
 * request could not be allocated.
 */
static ssize_t nfs_direct_read(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, unsigned int nr_pages)
{
	ssize_t result;
	sigset_t oldset;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct rpc_clnt *clnt = NFS_CLIENT(inode);
	struct nfs_direct_req *dreq;

	dreq = nfs_direct_read_alloc(count, NFS_SERVER(inode)->rsize);
	if (!dreq) {
		/* nothing was read into the pages, so don't dirty them */
		nfs_free_user_pages(pages, nr_pages, 0);
		return -ENOMEM;
	}

	dreq->user_addr = user_addr;
	dreq->user_count = count;
	dreq->pos = pos;
	dreq->pages = pages;
	dreq->npages = nr_pages;
	dreq->inode = inode;
	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
	/* only async requests keep the iocb; see nfs_direct_complete() */
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	nfs_add_stats(inode, NFSIOS_DIRECTREADBYTES, count);
	rpc_clnt_sigmask(clnt, &oldset);
	nfs_direct_read_schedule(dreq);
	result = nfs_direct_wait(dreq);
	rpc_clnt_sigunmask(clnt, &oldset);

	return result;
}

417
static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
L
Linus Torvalds 已提交
418
{
T
Trond Myklebust 已提交
419 420 421
	list_splice_init(&dreq->rewrite_list, &dreq->list);
	while (!list_empty(&dreq->list)) {
		struct nfs_write_data *data = list_entry(dreq->list.next, struct nfs_write_data, pages);
422 423 424 425
		list_del(&data->pages);
		nfs_writedata_release(data);
	}
}
L
Linus Torvalds 已提交
426

427 428 429
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
T
Trond Myklebust 已提交
430
	struct list_head *pos;
L
Linus Torvalds 已提交
431

T
Trond Myklebust 已提交
432 433 434
	list_splice_init(&dreq->rewrite_list, &dreq->list);
	list_for_each(pos, &dreq->list)
		dreq->outstanding++;
435
	dreq->count = 0;
436

T
Trond Myklebust 已提交
437
	nfs_direct_write_schedule(dreq, FLUSH_STABLE);
438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454
}

/*
 * rpc_call_done callback for the COMMIT that follows unstable writes.
 *
 * If the COMMIT failed, or the verifier it returned differs from the one
 * recorded for the writes, the writes are flagged for rescheduling;
 * nfs_direct_write_complete() then resends them as stable writes.
 *
 * A failed COMMIT is deliberately NOT stored in dreq->error.  The resend
 * produces its own result, and a latched commit error would make the
 * whole request report failure even when the resent stable writes all
 * succeed.  The verifier is likewise only compared when the COMMIT
 * succeeded.
 */
static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
{
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;

	/* Call the NFS version-specific code */
	if (NFS_PROTO(data->inode)->commit_done(task, data) != 0)
		return;

	if (unlikely(task->tk_status < 0)) {
		/* commit failed: fall back to resending the data stably */
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	} else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
		dprintk("NFS: %5u commit verify failed\n", task->tk_pid);
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	}

	dprintk("NFS: %5u commit returned %d\n", task->tk_pid, task->tk_status);
	nfs_direct_write_complete(dreq, data->inode);
}

461 462 463 464 465 466
/* RPC callbacks attached to the COMMIT task of a direct write */
static const struct rpc_call_ops nfs_commit_direct_ops = {
	.rpc_release	= nfs_commit_release,
	.rpc_call_done	= nfs_direct_commit_result,
};

static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
L
Linus Torvalds 已提交
467
{
468
	struct nfs_write_data *data = dreq->commit_data;
L
Linus Torvalds 已提交
469

470
	data->inode = dreq->inode;
471
	data->cred = dreq->ctx->cred;
L
Linus Torvalds 已提交
472

473
	data->args.fh = NFS_FH(data->inode);
T
Trond Myklebust 已提交
474 475
	data->args.offset = dreq->pos;
	data->args.count = dreq->user_count;
476 477 478
	data->res.count = 0;
	data->res.fattr = &data->fattr;
	data->res.verf = &data->verf;
L
Linus Torvalds 已提交
479

480 481 482
	rpc_init_task(&data->task, NFS_CLIENT(dreq->inode), RPC_TASK_ASYNC,
				&nfs_commit_direct_ops, data);
	NFS_PROTO(data->inode)->commit_setup(data, 0);
L
Linus Torvalds 已提交
483

484 485 486 487
	data->task.tk_priority = RPC_PRIORITY_NORMAL;
	data->task.tk_cookie = (unsigned long)data->inode;
	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
	dreq->commit_data = NULL;
L
Linus Torvalds 已提交
488

489
	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
L
Linus Torvalds 已提交
490

491 492 493 494
	lock_kernel();
	rpc_execute(&data->task);
	unlock_kernel();
}
L
Linus Torvalds 已提交
495

496 497 498
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	int flags = dreq->flags;
L
Linus Torvalds 已提交
499

500 501 502 503
	dreq->flags = 0;
	switch (flags) {
		case NFS_ODIRECT_DO_COMMIT:
			nfs_direct_commit_schedule(dreq);
L
Linus Torvalds 已提交
504
			break;
505 506 507 508 509 510 511 512 513 514 515
		case NFS_ODIRECT_RESCHED_WRITES:
			nfs_direct_write_reschedule(dreq);
			break;
		default:
			nfs_end_data_update(inode);
			if (dreq->commit_data != NULL)
				nfs_commit_free(dreq->commit_data);
			nfs_direct_free_writedata(dreq);
			nfs_direct_complete(dreq);
	}
}
L
Linus Torvalds 已提交
516

517 518 519 520 521 522 523 524 525 526 527
static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
	dreq->commit_data = nfs_commit_alloc(0);
	if (dreq->commit_data != NULL)
		dreq->commit_data->req = (struct nfs_page *) dreq;
}
#else
/*
 * Variant used when neither CONFIG_NFS_V3 nor CONFIG_NFS_V4 is defined:
 * no commit data is allocated.  With commit_data NULL, nfs_direct_write()
 * asks for stable writes.
 */
static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
	dreq->commit_data = NULL;
}
L
Linus Torvalds 已提交
528

529 530 531 532 533 534 535
/*
 * Variant used when neither CONFIG_NFS_V3 nor CONFIG_NFS_V4 is defined:
 * there is no commit or resend step, so once the writes are done we end
 * the data update, release the write data and complete the request.
 */
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	nfs_end_data_update(inode);
	nfs_direct_free_writedata(dreq);
	nfs_direct_complete(dreq);
}
#endif
L
Linus Torvalds 已提交
536

T
Trond Myklebust 已提交
537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577
/*
 * Allocate a direct request together with one nfs_write_data per
 * wsize-sized piece of an nbytes-long write (at least one is always
 * allocated), then try to set aside commit data for the request.
 *
 * dreq->outstanding is brought up to the full number of pieces before
 * anything is dispatched.
 *
 * Returns NULL if the request or any nfs_write_data cannot be
 * allocated; everything allocated so far is released first.
 */
static struct nfs_direct_req *nfs_direct_write_alloc(size_t nbytes, size_t wsize)
{
	struct nfs_direct_req *dreq;
	struct nfs_write_data *wdata;
	unsigned int wpages = (wsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

	dreq = nfs_direct_req_alloc();
	if (dreq == NULL)
		return NULL;

	for (;;) {
		wdata = nfs_writedata_alloc(wpages);
		if (unlikely(wdata == NULL))
			goto out_nomem;

		INIT_LIST_HEAD(&wdata->pages);
		list_add(&wdata->pages, &dreq->list);

		/* the req field is borrowed to point back at the dreq */
		wdata->req = (struct nfs_page *) dreq;
		dreq->outstanding++;

		if (nbytes <= wsize)
			break;
		nbytes -= wsize;
	}

	nfs_alloc_commit_data(dreq);

	/*
	 * Second reference: one is dropped by nfs_direct_wait(), the
	 * other by nfs_direct_complete().
	 */
	kref_get(&dreq->kref);
	return dreq;

out_nomem:
	while (!list_empty(&dreq->list)) {
		wdata = list_entry(dreq->list.next, struct nfs_write_data, pages);
		list_del(&wdata->pages);
		nfs_writedata_free(wdata);
	}
	kref_put(&dreq->kref, nfs_direct_req_release);
	return NULL;
}

578
static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
L
Linus Torvalds 已提交
579
{
580 581 582 583 584 585 586
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
	int status = task->tk_status;

	if (nfs_writeback_done(task, data) != 0)
		return;

587
	spin_lock(&dreq->lock);
L
Linus Torvalds 已提交
588

589
	if (likely(status >= 0))
590
		dreq->count += data->res.count;
591
	else
592
		dreq->error = task->tk_status;
L
Linus Torvalds 已提交
593

594 595 596 597 598
	if (data->res.verf->committed != NFS_FILE_SYNC) {
		switch (dreq->flags) {
			case 0:
				memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf));
				dreq->flags = NFS_ODIRECT_DO_COMMIT;
L
Linus Torvalds 已提交
599
				break;
600 601 602 603 604
			case NFS_ODIRECT_DO_COMMIT:
				if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) {
					dprintk("NFS: %5u write verify failed\n", task->tk_pid);
					dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
				}
L
Linus Torvalds 已提交
605 606
		}
	}
T
Trond Myklebust 已提交
607 608
	/* In case we have to resend */
	data->args.stable = NFS_FILE_SYNC;
609 610

	spin_unlock(&dreq->lock);
L
Linus Torvalds 已提交
611 612
}

613 614 615
/*
 * NB: Return the value of the first error return code.  Subsequent
 *     errors after the first one are ignored.
L
Linus Torvalds 已提交
616
 */
617
static void nfs_direct_write_release(void *calldata)
L
Linus Torvalds 已提交
618
{
619 620
	struct nfs_write_data *data = calldata;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req;
L
Linus Torvalds 已提交
621

T
Trond Myklebust 已提交
622 623 624 625 626 627 628 629
	spin_lock(&dreq->lock);
	if (--dreq->outstanding) {
		spin_unlock(&dreq->lock);
		return;
	}
	spin_unlock(&dreq->lock);

	nfs_direct_write_complete(dreq, data->inode);
630 631 632 633
}

static const struct rpc_call_ops nfs_write_direct_ops = {
	.rpc_call_done = nfs_direct_write_result,
634
	.rpc_release = nfs_direct_write_release,
635 636 637
};

/*
T
Trond Myklebust 已提交
638 639
 * For each nfs_write_data struct that was allocated on the list, dispatch
 * an NFS WRITE operation
640
 */
T
Trond Myklebust 已提交
641
static void nfs_direct_write_schedule(struct nfs_direct_req *dreq, int sync)
642
{
643 644
	struct nfs_open_context *ctx = dreq->ctx;
	struct inode *inode = ctx->dentry->d_inode;
T
Trond Myklebust 已提交
645 646 647 648
	struct list_head *list = &dreq->list;
	struct page **pages = dreq->pages;
	size_t count = dreq->user_count;
	loff_t pos = dreq->pos;
649
	size_t wsize = NFS_SERVER(inode)->wsize;
T
Trond Myklebust 已提交
650
	unsigned int curpage, pgbase;
651

T
Trond Myklebust 已提交
652 653
	curpage = 0;
	pgbase = dreq->user_addr & ~PAGE_MASK;
L
Linus Torvalds 已提交
654
	do {
655
		struct nfs_write_data *data;
656 657 658 659 660 661
		size_t bytes;

		bytes = wsize;
		if (count < wsize)
			bytes = count;

T
Trond Myklebust 已提交
662 663
		BUG_ON(list_empty(list));
		data = list_entry(list->next, struct nfs_write_data, pages);
664
		list_move_tail(&data->pages, &dreq->rewrite_list);
665 666 667 668 669

		data->inode = inode;
		data->cred = ctx->cred;
		data->args.fh = NFS_FH(inode);
		data->args.context = ctx;
670
		data->args.offset = pos;
671
		data->args.pgbase = pgbase;
T
Trond Myklebust 已提交
672
		data->args.pages = &pages[curpage];
673 674 675
		data->args.count = bytes;
		data->res.fattr = &data->fattr;
		data->res.count = bytes;
676
		data->res.verf = &data->verf;
677 678 679

		rpc_init_task(&data->task, NFS_CLIENT(inode), RPC_TASK_ASYNC,
				&nfs_write_direct_ops, data);
680
		NFS_PROTO(inode)->write_setup(data, sync);
L
Linus Torvalds 已提交
681

682 683
		data->task.tk_priority = RPC_PRIORITY_NORMAL;
		data->task.tk_cookie = (unsigned long) inode;
L
Linus Torvalds 已提交
684 685

		lock_kernel();
686
		rpc_execute(&data->task);
L
Linus Torvalds 已提交
687 688
		unlock_kernel();

689
		dfprintk(VFS, "NFS: %5u initiated direct write call (req %s/%Ld, %zu bytes @ offset %Lu)\n",
690 691 692 693 694
				data->task.tk_pid,
				inode->i_sb->s_id,
				(long long)NFS_FILEID(inode),
				bytes,
				(unsigned long long)data->args.offset);
L
Linus Torvalds 已提交
695

696
		pos += bytes;
697
		pgbase += bytes;
T
Trond Myklebust 已提交
698
		curpage += pgbase >> PAGE_SHIFT;
699
		pgbase &= ~PAGE_MASK;
L
Linus Torvalds 已提交
700

701 702
		count -= bytes;
	} while (count != 0);
T
Trond Myklebust 已提交
703
	BUG_ON(!list_empty(list));
704
}
L
Linus Torvalds 已提交
705

T
Trond Myklebust 已提交
706
/*
 * Carry out a direct write of @count bytes at file offset @pos from the
 * user buffer at @user_addr, whose pages have already been pinned in
 * @pages (@nr_pages entries).
 *
 * This function takes over @pages.  On the normal path they are released
 * by nfs_direct_complete() when the request finishes.  If the request
 * cannot be allocated they are released here, because the caller does
 * not release them on failure.
 *
 * Stable writes are requested when no commit data could be set aside or
 * when the whole write is smaller than wsize.
 *
 * Returns the result of nfs_direct_wait(): a byte count, a negative
 * errno, or -EIOCBQUEUED for an async iocb.  Returns -ENOMEM if the
 * request could not be allocated.
 */
static ssize_t nfs_direct_write(struct kiocb *iocb, unsigned long user_addr, size_t count, loff_t pos, struct page **pages, int nr_pages)
{
	ssize_t result;
	sigset_t oldset;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct rpc_clnt *clnt = NFS_CLIENT(inode);
	struct nfs_direct_req *dreq;
	size_t wsize = NFS_SERVER(inode)->wsize;
	int sync = 0;

	dreq = nfs_direct_write_alloc(count, wsize);
	if (!dreq) {
		/* the pages were only going to be read from: don't dirty */
		nfs_free_user_pages(pages, nr_pages, 0);
		return -ENOMEM;
	}
	if (dreq->commit_data == NULL || count < wsize)
		sync = FLUSH_STABLE;

	dreq->user_addr = user_addr;
	dreq->user_count = count;
	dreq->pos = pos;
	dreq->pages = pages;
	dreq->npages = nr_pages;
	dreq->inode = inode;
	dreq->ctx = get_nfs_open_context((struct nfs_open_context *)iocb->ki_filp->private_data);
	/* only async requests keep the iocb; see nfs_direct_complete() */
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	nfs_add_stats(inode, NFSIOS_DIRECTWRITTENBYTES, count);

	/* paired with nfs_end_data_update() in nfs_direct_write_complete() */
	nfs_begin_data_update(inode);

	rpc_clnt_sigmask(clnt, &oldset);
	nfs_direct_write_schedule(dreq, sync);
	result = nfs_direct_wait(dreq);
	rpc_clnt_sigunmask(clnt, &oldset);

	return result;
}

/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
 * @buf: user's buffer into which to read data
748 749
 * @count: number of bytes to read
 * @pos: byte offset in file where reading starts
L
Linus Torvalds 已提交
750 751 752 753 754 755
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid gfar's check to see if
 * the request starts before the end of the file.  For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
756
 * READ where the file size could change.  Our preference is simply
L
Linus Torvalds 已提交
757 758
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
759
 *
L
Linus Torvalds 已提交
760 761 762 763 764
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
765
ssize_t nfs_file_direct_read(struct kiocb *iocb, char __user *buf, size_t count, loff_t pos)
L
Linus Torvalds 已提交
766 767
{
	ssize_t retval = -EINVAL;
T
Trond Myklebust 已提交
768 769
	int page_count;
	struct page **pages;
L
Linus Torvalds 已提交
770 771 772
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;

773
	dprintk("nfs: direct read(%s/%s, %lu@%Ld)\n",
774 775
		file->f_dentry->d_parent->d_name.name,
		file->f_dentry->d_name.name,
776
		(unsigned long) count, (long long) pos);
L
Linus Torvalds 已提交
777 778 779 780

	if (count < 0)
		goto out;
	retval = -EFAULT;
781
	if (!access_ok(VERIFY_WRITE, buf, count))
L
Linus Torvalds 已提交
782 783 784 785 786
		goto out;
	retval = 0;
	if (!count)
		goto out;

T
Trond Myklebust 已提交
787 788 789
	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;
L
Linus Torvalds 已提交
790

T
Trond Myklebust 已提交
791 792 793 794 795 796 797 798
	retval = nfs_get_user_pages(READ, (unsigned long) buf,
						count, &pages);
	if (retval < 0)
		goto out;
	page_count = retval;

	retval = nfs_direct_read(iocb, (unsigned long) buf, count, pos,
						pages, page_count);
L
Linus Torvalds 已提交
799
	if (retval > 0)
800
		iocb->ki_pos = pos + retval;
L
Linus Torvalds 已提交
801 802 803 804 805 806 807 808 809

out:
	return retval;
}

/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
 * @buf: user's buffer from which to write data
810 811
 * @count: number of bytes to write
 * @pos: byte offset in file where writing starts
L
Linus Torvalds 已提交
812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size.  The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache.  We let the server do generic write
 * parameter checking and report problems.
 *
 * We also avoid an unnecessary invocation of generic_osync_inode(),
 * as it is fairly meaningless to sync the metadata of an NFS file.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
831
ssize_t nfs_file_direct_write(struct kiocb *iocb, const char __user *buf, size_t count, loff_t pos)
L
Linus Torvalds 已提交
832
{
833
	ssize_t retval;
T
Trond Myklebust 已提交
834 835
	int page_count;
	struct page **pages;
L
Linus Torvalds 已提交
836 837 838
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;

839
	dfprintk(VFS, "nfs: direct write(%s/%s, %lu@%Ld)\n",
840
		file->f_dentry->d_parent->d_name.name,
841 842
		file->f_dentry->d_name.name,
		(unsigned long) count, (long long) pos);
L
Linus Torvalds 已提交
843

844 845
	retval = generic_write_checks(file, &pos, &count, 0);
	if (retval)
L
Linus Torvalds 已提交
846
		goto out;
847 848 849

	retval = -EINVAL;
	if ((ssize_t) count < 0)
L
Linus Torvalds 已提交
850 851 852 853
		goto out;
	retval = 0;
	if (!count)
		goto out;
854 855

	retval = -EFAULT;
856
	if (!access_ok(VERIFY_READ, buf, count))
857
		goto out;
L
Linus Torvalds 已提交
858

T
Trond Myklebust 已提交
859 860 861
	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;
L
Linus Torvalds 已提交
862

T
Trond Myklebust 已提交
863 864 865 866 867 868 869 870
	retval = nfs_get_user_pages(WRITE, (unsigned long) buf,
						count, &pages);
	if (retval < 0)
		goto out;
	page_count = retval;

	retval = nfs_direct_write(iocb, (unsigned long) buf, count,
					pos, pages, page_count);
871 872 873 874 875 876 877 878 879

	/*
	 * XXX: nfs_end_data_update() already ensures this file's
	 *      cached data is subsequently invalidated.  Do we really
	 *      need to call invalidate_inode_pages2() again here?
	 *
	 *      For aio writes, this invalidation will almost certainly
	 *      occur before the writes complete.  Kind of racey.
	 */
L
Linus Torvalds 已提交
880 881
	if (mapping->nrpages)
		invalidate_inode_pages2(mapping);
882

L
Linus Torvalds 已提交
883
	if (retval > 0)
884
		iocb->ki_pos = pos + retval;
L
Linus Torvalds 已提交
885 886 887 888 889

out:
	return retval;
}

890 891 892 893
/**
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
D
David Howells 已提交
894
int __init nfs_init_directcache(void)
L
Linus Torvalds 已提交
895 896 897
{
	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
						sizeof(struct nfs_direct_req),
898 899
						0, (SLAB_RECLAIM_ACCOUNT|
							SLAB_MEM_SPREAD),
L
Linus Torvalds 已提交
900 901 902 903 904 905 906
						NULL, NULL);
	if (nfs_direct_cachep == NULL)
		return -ENOMEM;

	return 0;
}

907
/**
D
David Howells 已提交
908
 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
909 910
 *
 */
911
void nfs_destroy_directcache(void)
L
Linus Torvalds 已提交
912 913 914 915
{
	if (kmem_cache_destroy(nfs_direct_cachep))
		printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n");
}