direct.c 28.3 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9
/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
10
 * (multiple copies of the same instance running on separate hosts)
L
Linus Torvalds 已提交
11
 * implement their own cache coherency protocol that subsumes file
12 13 14
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
L
Linus Torvalds 已提交
15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 * 15 Sep 2004	Parallel async reads  --cel
37
 * 04 May 2005	support O_DIRECT with aio  --cel
L
Linus Torvalds 已提交
38 39 40 41 42 43 44 45 46
 *
 */

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>
47
#include <linux/slab.h>
48
#include <linux/task_io_accounting_ops.h>
L
Linus Torvalds 已提交
49 50 51 52 53 54

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <asm/uaccess.h>
A
Arun Sharma 已提交
55
#include <linux/atomic.h>
L
Linus Torvalds 已提交
56

57
#include "internal.h"
C
Chuck Lever 已提交
58 59
#include "iostat.h"

L
Linus Torvalds 已提交
60 61
#define NFSDBG_FACILITY		NFSDBG_VFS

62
static struct kmem_cache *nfs_direct_cachep;
L
Linus Torvalds 已提交
63 64 65 66 67 68

/*
 * This represents a set of asynchronous requests that we're waiting on
 */
struct nfs_direct_req {
	struct kref		kref;		/* release manager */
69 70

	/* I/O parameters */
71
	struct nfs_open_context	*ctx;		/* file open context info */
72
	struct nfs_lock_context *l_ctx;		/* Lock context info */
73
	struct kiocb *		iocb;		/* controlling i/o request */
74
	struct inode *		inode;		/* target file of i/o */
75 76

	/* completion state */
77
	atomic_t		io_count;	/* i/os we're waiting for */
78 79
	spinlock_t		lock;		/* protect completion state */
	ssize_t			count,		/* bytes actually processed */
L
Linus Torvalds 已提交
80
				error;		/* any reported error */
81
	struct completion	completion;	/* wait for i/o completion */
82 83

	/* commit state */
84
	struct list_head	rewrite_list;	/* saved nfs_write_data structs */
85
	struct nfs_commit_data *commit_data;	/* special write_data for commits */
86 87 88 89
	int			flags;
#define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
	struct nfs_writeverf	verf;		/* unstable write verifier */
L
Linus Torvalds 已提交
90 91
};

92
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode);
93 94 95 96 97 98 99 100 101 102 103 104
static const struct rpc_call_ops nfs_write_direct_ops;

static inline void get_dreq(struct nfs_direct_req *dreq)
{
	atomic_inc(&dreq->io_count);
}

/*
 * Drop one outstanding-I/O count from @dreq.  Returns the result of
 * atomic_dec_and_test(), i.e. non-zero only when the count hit zero.
 */
static inline int put_dreq(struct nfs_direct_req *dreq)
{
	int reached_zero = atomic_dec_and_test(&dreq->io_count);

	return reached_zero;
}

L
Linus Torvalds 已提交
105
/**
106 107 108 109 110 111 112 113 114 115 116
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @rw: direction (read or write)
 * @iocb: target I/O control block
 * @iov: array of vectors that define I/O buffer
 * @pos: offset in file to begin the operation
 * @nr_segs: size of iovec array
 *
 * The presence of this routine in the address space ops vector means
 * the NFS client supports direct I/O.  However, we shunt off direct
 * read and write requests before the VFS gets them, so this method
 * should never be called.
L
Linus Torvalds 已提交
117
 */
118 119 120
ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
{
	dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
121
			iocb->ki_filp->f_path.dentry->d_name.name,
122
			(long long) pos, nr_segs);
123 124 125 126

	return -EINVAL;
}

127
/* Drop one page reference on each of the first @npages entries of @pages. */
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
{
	struct page **cursor = pages;
	struct page **end = pages + npages;

	for (; cursor != end; cursor++)
		page_cache_release(*cursor);
}

134
static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
L
Linus Torvalds 已提交
135
{
136 137
	struct nfs_direct_req *dreq;

138
	dreq = kmem_cache_alloc(nfs_direct_cachep, GFP_KERNEL);
139 140 141 142
	if (!dreq)
		return NULL;

	kref_init(&dreq->kref);
143
	kref_get(&dreq->kref);
144
	init_completion(&dreq->completion);
145
	INIT_LIST_HEAD(&dreq->rewrite_list);
146
	dreq->iocb = NULL;
147
	dreq->ctx = NULL;
148
	dreq->l_ctx = NULL;
149
	spin_lock_init(&dreq->lock);
150
	atomic_set(&dreq->io_count, 0);
151 152
	dreq->count = 0;
	dreq->error = 0;
153
	dreq->flags = 0;
154 155

	return dreq;
L
Linus Torvalds 已提交
156 157
}

158
static void nfs_direct_req_free(struct kref *kref)
L
Linus Torvalds 已提交
159 160
{
	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
161

162 163
	if (dreq->l_ctx != NULL)
		nfs_put_lock_context(dreq->l_ctx);
164 165
	if (dreq->ctx != NULL)
		put_nfs_open_context(dreq->ctx);
L
Linus Torvalds 已提交
166 167 168
	kmem_cache_free(nfs_direct_cachep, dreq);
}

169 170 171 172 173
/* Drop one kref reference on @dreq; the last put frees it. */
static void nfs_direct_req_release(struct nfs_direct_req *dreq)
{
	struct kref *ref = &dreq->kref;

	kref_put(ref, nfs_direct_req_free);
}

174 175 176 177 178
/*
 * Collects and returns the final error value/byte-count.
 */
static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
179
	ssize_t result = -EIOCBQUEUED;
180 181 182 183 184

	/* Async requests don't wait here */
	if (dreq->iocb)
		goto out;

185
	result = wait_for_completion_killable(&dreq->completion);
186 187

	if (!result)
188
		result = dreq->error;
189
	if (!result)
190
		result = dreq->count;
191 192 193 194 195

out:
	return (ssize_t) result;
}

196
/*
197 198
 * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
 * the iocb is still valid here if this is a synchronous request.
199 200 201 202
 */
static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
	if (dreq->iocb) {
203
		long res = (long) dreq->error;
204
		if (!res)
205
			res = (long) dreq->count;
206
		aio_complete(dreq->iocb, res, 0);
207 208
	}
	complete_all(&dreq->completion);
209

210
	nfs_direct_req_release(dreq);
211 212
}

213
void nfs_direct_readpage_release(struct nfs_page *req)
L
Linus Torvalds 已提交
214
{
215 216 217 218 219 220
	dprintk("NFS: direct read done (%s/%lld %d@%lld)\n",
		req->wb_context->dentry->d_inode->i_sb->s_id,
		(long long)NFS_FILEID(req->wb_context->dentry->d_inode),
		req->wb_bytes,
		(long long)req_offset(req));
	nfs_release_request(req);
221 222
}

223
static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
224
{
225 226
	unsigned long bytes = 0;
	struct nfs_direct_req *dreq = hdr->dreq;
227

228 229
	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
		goto out_put;
230 231

	spin_lock(&dreq->lock);
232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257
	if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
		dreq->error = hdr->error;
	else
		dreq->count += hdr->good_bytes;
	spin_unlock(&dreq->lock);

	if (!test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
		while (!list_empty(&hdr->pages)) {
			struct nfs_page *req = nfs_list_entry(hdr->pages.next);
			struct page *page = req->wb_page;

			if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) {
				if (bytes > hdr->good_bytes)
					zero_user(page, 0, PAGE_SIZE);
				else if (hdr->good_bytes - bytes < PAGE_SIZE)
					zero_user_segment(page,
						hdr->good_bytes & ~PAGE_MASK,
						PAGE_SIZE);
			}
			bytes += req->wb_bytes;
			nfs_list_remove_request(req);
			nfs_direct_readpage_release(req);
			if (!PageCompound(page))
				set_page_dirty(page);
			page_cache_release(page);
		}
258
	} else {
259 260 261 262 263 264 265 266 267 268 269
		while (!list_empty(&hdr->pages)) {
			struct nfs_page *req = nfs_list_entry(hdr->pages.next);

			if (bytes < hdr->good_bytes)
				if (!PageCompound(req->wb_page))
					set_page_dirty(req->wb_page);
			bytes += req->wb_bytes;
			page_cache_release(req->wb_page);
			nfs_list_remove_request(req);
			nfs_direct_readpage_release(req);
		}
270
	}
271
out_put:
272 273
	if (put_dreq(dreq))
		nfs_direct_complete(dreq);
274
	hdr->release(hdr);
L
Linus Torvalds 已提交
275 276
}

277
static void nfs_sync_pgio_error(struct list_head *head)
278
{
279
	struct nfs_page *req;
280

281 282 283 284 285
	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_release_request(req);
	}
286 287
}

288 289 290 291 292 293 294 295 296 297 298
/* Header-init hook: each new pgio header counts as one outstanding I/O. */
static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
{
	struct nfs_direct_req *dreq = hdr->dreq;

	get_dreq(dreq);
}

/* Page I/O completion hooks used for direct reads. */
static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
	.init_hdr	= nfs_direct_pgio_init,
	.completion	= nfs_direct_read_completion,
	.error_cleanup	= nfs_sync_pgio_error,
};

299
/*
300 301 302 303 304
 * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
 * operation.  If nfs_readdata_alloc() or get_user_pages() fails,
 * bail and stop sending more reads.  Read length accounting is
 * handled automatically by nfs_direct_read_result().  Otherwise, if
 * no requests have been sent, just return an error.
L
Linus Torvalds 已提交
305
 */
306
static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc,
307 308
						const struct iovec *iov,
						loff_t pos)
L
Linus Torvalds 已提交
309
{
310
	struct nfs_direct_req *dreq = desc->pg_dreq;
311
	struct nfs_open_context *ctx = dreq->ctx;
312
	struct inode *inode = ctx->dentry->d_inode;
313 314
	unsigned long user_addr = (unsigned long)iov->iov_base;
	size_t count = iov->iov_len;
315
	size_t rsize = NFS_SERVER(inode)->rsize;
316 317 318
	unsigned int pgbase;
	int result;
	ssize_t started = 0;
319 320
	struct page **pagevec = NULL;
	unsigned int npages;
321

L
Linus Torvalds 已提交
322
	do {
323
		size_t bytes;
324
		int i;
L
Linus Torvalds 已提交
325

326
		pgbase = user_addr & ~PAGE_MASK;
327
		bytes = min(max(rsize, PAGE_SIZE), count);
328

329
		result = -ENOMEM;
330 331 332 333 334
		npages = nfs_page_array_len(pgbase, bytes);
		if (!pagevec)
			pagevec = kmalloc(npages * sizeof(struct page *),
					  GFP_KERNEL);
		if (!pagevec)
335
			break;
336 337
		down_read(&current->mm->mmap_sem);
		result = get_user_pages(current, current->mm, user_addr,
338
					npages, 1, 0, pagevec, NULL);
339
		up_read(&current->mm->mmap_sem);
340
		if (result < 0)
341
			break;
342
		if ((unsigned)result < npages) {
343 344
			bytes = result * PAGE_SIZE;
			if (bytes <= pgbase) {
345
				nfs_direct_release_pages(pagevec, result);
346 347 348
				break;
			}
			bytes -= pgbase;
349
			npages = result;
350 351
		}

352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380
		for (i = 0; i < npages; i++) {
			struct nfs_page *req;
			unsigned int req_len = min(bytes, PAGE_SIZE - pgbase);
			/* XXX do we need to do the eof zeroing found in async_filler? */
			req = nfs_create_request(dreq->ctx, dreq->inode,
						 pagevec[i],
						 pgbase, req_len);
			if (IS_ERR(req)) {
				nfs_direct_release_pages(pagevec + i,
							 npages - i);
				result = PTR_ERR(req);
				break;
			}
			req->wb_index = pos >> PAGE_SHIFT;
			req->wb_offset = pos & ~PAGE_MASK;
			if (!nfs_pageio_add_request(desc, req)) {
				result = desc->pg_error;
				nfs_release_request(req);
				nfs_direct_release_pages(pagevec + i,
							 npages - i);
				break;
			}
			pgbase = 0;
			bytes -= req_len;
			started += req_len;
			user_addr += req_len;
			pos += req_len;
			count -= req_len;
		}
L
Linus Torvalds 已提交
381
	} while (count != 0);
382

383 384
	kfree(pagevec);

385
	if (started)
386
		return started;
387
	return result < 0 ? (ssize_t) result : -EFAULT;
L
Linus Torvalds 已提交
388 389
}

390 391 392 393 394
/*
 * Queue reads for every segment of @iov, starting at file offset @pos.
 *
 * An extra I/O count is held on @dreq while requests are being queued
 * so the request cannot complete before scheduling has finished.
 * Scheduling stops at the first segment that fails or is only
 * partially queued.
 *
 * Returns 0 if at least one byte was queued.  If nothing was queued,
 * drops one request reference and returns the error (-EIO if none was
 * recorded).
 */
static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
					      const struct iovec *iov,
					      unsigned long nr_segs,
					      loff_t pos)
{
	struct nfs_pageio_descriptor desc;
	const struct iovec *vec = iov;
	const struct iovec *const end = iov + nr_segs;
	size_t requested_bytes = 0;
	ssize_t result = -EINVAL;

	nfs_pageio_init_read(&desc, dreq->inode,
			     &nfs_direct_read_completion_ops);
	get_dreq(dreq);
	desc.pg_dreq = dreq;

	for (; vec != end; vec++) {
		result = nfs_direct_read_schedule_segment(&desc, vec, pos);
		if (result < 0)
			break;
		requested_bytes += result;
		/* a short segment ends the whole request */
		if ((size_t)result < vec->iov_len)
			break;
		pos += vec->iov_len;
	}

	nfs_pageio_complete(&desc);

	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 */
	if (requested_bytes == 0) {
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

	/* drop the scheduling-time count; complete if all I/O is done */
	if (put_dreq(dreq))
		nfs_direct_complete(dreq);
	return 0;
}

432 433
/*
 * Set up a direct request for a read and run it.
 *
 * @iocb:    I/O control block of the call; remembered in the request
 *           only when it is not a synchronous kiocb
 * @iov:     user buffer segments to read into
 * @nr_segs: number of entries in @iov
 * @pos:     file offset at which to start reading
 *
 * Returns whatever nfs_direct_wait() reports once requests have been
 * scheduled (byte count, error, or -EIOCBQUEUED for async requests),
 * or a negative errno if the request could not be set up or nothing
 * could be scheduled.
 *
 * Reference counting: nfs_direct_req_alloc() hands back a request
 * holding two references.  One is dropped at out_release below; the
 * other is dropped by nfs_direct_read_schedule_iovec(), either through
 * nfs_direct_complete() or directly when nothing was scheduled.
 */
static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
			       unsigned long nr_segs, loff_t pos)
{
	ssize_t result = -ENOMEM;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct nfs_direct_req *dreq;

	dreq = nfs_direct_req_alloc();
	if (dreq == NULL)
		goto out;

	dreq->inode = inode;
	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
	dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
	if (dreq->l_ctx == NULL) {
		/*
		 * Scheduling never ran, so nobody else will drop the
		 * second reference; drop it here or the request (and its
		 * open context) would be leaked.
		 */
		nfs_direct_req_release(dreq);
		goto out_release;
	}
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos);
	if (!result)
		result = nfs_direct_wait(dreq);
out_release:
	nfs_direct_req_release(dreq);
out:
	return result;
}

460 461 462 463
static void nfs_direct_writehdr_release(struct nfs_write_header *whdr)
{
	struct nfs_write_data *data = &whdr->rpc_data;

F
Fred Isaman 已提交
464 465
	if (data->pages.pagevec != data->pages.page_array)
		kfree(data->pages.pagevec);
466 467 468
	nfs_writehdr_free(&whdr->header);
}

469
static void nfs_direct_free_writedata(struct nfs_direct_req *dreq)
L
Linus Torvalds 已提交
470
{
471
	while (!list_empty(&dreq->rewrite_list)) {
472 473
		struct nfs_pgio_header *hdr = list_entry(dreq->rewrite_list.next, struct nfs_pgio_header, pages);
		struct nfs_write_header *whdr = container_of(hdr, struct nfs_write_header, header);
F
Fred Isaman 已提交
474 475
		struct nfs_page_array *p = &whdr->rpc_data.pages;

476
		list_del(&hdr->pages);
F
Fred Isaman 已提交
477
		nfs_direct_release_pages(p->pagevec, p->npages);
478
		nfs_direct_writehdr_release(whdr);
479 480
	}
}
L
Linus Torvalds 已提交
481

482 483 484
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
485 486 487
	struct inode *inode = dreq->inode;
	struct list_head *p;
	struct nfs_write_data *data;
488
	struct nfs_pgio_header *hdr;
489
	struct rpc_task *task;
490 491 492
	struct rpc_message msg = {
		.rpc_cred = dreq->ctx->cred,
	};
493 494
	struct rpc_task_setup task_setup_data = {
		.rpc_client = NFS_CLIENT(inode),
T
Terry Loftin 已提交
495
		.rpc_message = &msg,
496
		.callback_ops = &nfs_write_direct_ops,
497
		.workqueue = nfsiod_workqueue,
498 499
		.flags = RPC_TASK_ASYNC,
	};
L
Linus Torvalds 已提交
500

501
	dreq->count = 0;
502 503 504
	get_dreq(dreq);

	list_for_each(p, &dreq->rewrite_list) {
505 506
		hdr = list_entry(p, struct nfs_pgio_header, pages);
		data = &(container_of(hdr, struct nfs_write_header, header))->rpc_data;
507 508 509

		get_dreq(dreq);

510 511 512
		/* Use stable writes */
		data->args.stable = NFS_FILE_SYNC;

513 514 515 516 517 518 519 520 521 522 523
		/*
		 * Reset data->res.
		 */
		nfs_fattr_init(&data->fattr);
		data->res.count = data->args.count;
		memset(&data->verf, 0, sizeof(data->verf));

		/*
		 * Reuse data->task; data->args should not have changed
		 * since the original request was sent.
		 */
524
		task_setup_data.task = &data->task;
525
		task_setup_data.callback_data = data;
526 527 528
		msg.rpc_argp = &data->args;
		msg.rpc_resp = &data->res;
		NFS_PROTO(inode)->write_setup(data, &msg);
529 530 531 532

		/*
		 * We're called via an RPC callback, so BKL is already held.
		 */
533 534 535
		task = rpc_run_task(&task_setup_data);
		if (!IS_ERR(task))
			rpc_put_task(task);
536 537 538 539 540 541 542 543

		dprintk("NFS: %5u rescheduled direct write call (req %s/%Ld, %u bytes @ offset %Lu)\n",
				data->task.tk_pid,
				inode->i_sb->s_id,
				(long long)NFS_FILEID(inode),
				data->args.count,
				(unsigned long long)data->args.offset);
	}
544

545 546
	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq, inode);
547 548 549 550
}

static void nfs_direct_commit_result(struct rpc_task *task, void *calldata)
{
551
	struct nfs_commit_data *data = calldata;
552 553

	/* Call the NFS version-specific code */
554 555 556 557 558
	NFS_PROTO(data->inode)->commit_done(task, data);
}

static void nfs_direct_commit_release(void *calldata)
{
559 560
	struct nfs_commit_data *data = calldata;
	struct nfs_direct_req *dreq = data->dreq;
561 562 563
	int status = data->task.tk_status;

	if (status < 0) {
564
		dprintk("NFS: %5u commit failed with error %d.\n",
565
				data->task.tk_pid, status);
566
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
567
	} else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) {
568
		dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid);
569
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
L
Linus Torvalds 已提交
570 571
	}

572
	dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status);
573
	nfs_direct_write_complete(dreq, data->inode);
T
Trond Myklebust 已提交
574
	nfs_commit_free(data);
L
Linus Torvalds 已提交
575 576
}

577
static const struct rpc_call_ops nfs_commit_direct_ops = {
578
	.rpc_call_prepare = nfs_commit_prepare,
579
	.rpc_call_done = nfs_direct_commit_result,
580
	.rpc_release = nfs_direct_commit_release,
581 582 583
};

static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
L
Linus Torvalds 已提交
584
{
585
	struct nfs_commit_data *data = dreq->commit_data;
586
	struct rpc_task *task;
587 588 589 590 591
	struct rpc_message msg = {
		.rpc_argp = &data->args,
		.rpc_resp = &data->res,
		.rpc_cred = dreq->ctx->cred,
	};
592
	struct rpc_task_setup task_setup_data = {
593
		.task = &data->task,
594
		.rpc_client = NFS_CLIENT(dreq->inode),
595
		.rpc_message = &msg,
596 597
		.callback_ops = &nfs_commit_direct_ops,
		.callback_data = data,
598
		.workqueue = nfsiod_workqueue,
599 600
		.flags = RPC_TASK_ASYNC,
	};
L
Linus Torvalds 已提交
601

602
	data->inode = dreq->inode;
603
	data->cred = msg.rpc_cred;
L
Linus Torvalds 已提交
604

605
	data->args.fh = NFS_FH(data->inode);
606 607
	data->args.offset = 0;
	data->args.count = 0;
608 609
	data->res.fattr = &data->fattr;
	data->res.verf = &data->verf;
610
	nfs_fattr_init(&data->fattr);
L
Linus Torvalds 已提交
611

612
	NFS_PROTO(data->inode)->commit_setup(data, &msg);
L
Linus Torvalds 已提交
613

614 615
	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
	dreq->commit_data = NULL;
L
Linus Torvalds 已提交
616

617
	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
L
Linus Torvalds 已提交
618

619 620 621
	task = rpc_run_task(&task_setup_data);
	if (!IS_ERR(task))
		rpc_put_task(task);
622
}
L
Linus Torvalds 已提交
623

624 625 626
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	int flags = dreq->flags;
L
Linus Torvalds 已提交
627

628 629 630 631
	dreq->flags = 0;
	switch (flags) {
		case NFS_ODIRECT_DO_COMMIT:
			nfs_direct_commit_schedule(dreq);
L
Linus Torvalds 已提交
632
			break;
633 634 635 636 637 638 639
		case NFS_ODIRECT_RESCHED_WRITES:
			nfs_direct_write_reschedule(dreq);
			break;
		default:
			if (dreq->commit_data != NULL)
				nfs_commit_free(dreq->commit_data);
			nfs_direct_free_writedata(dreq);
640
			nfs_zap_mapping(inode, inode->i_mapping);
641 642 643
			nfs_direct_complete(dreq);
	}
}
L
Linus Torvalds 已提交
644

645 646
static void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
647
	dreq->commit_data = nfs_commitdata_alloc();
648
	if (dreq->commit_data != NULL)
649
		dreq->commit_data->dreq = dreq;
650 651 652 653 654 655
}
#else
/* Built without NFSv3/v4: no commit data is ever allocated. */
static inline void nfs_alloc_commit_data(struct nfs_direct_req *dreq)
{
	struct nfs_commit_data *none = NULL;

	dreq->commit_data = none;
}
L
Linus Torvalds 已提交
656

657 658 659
static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode)
{
	nfs_direct_free_writedata(dreq);
660
	nfs_zap_mapping(inode, inode->i_mapping);
661 662 663
	nfs_direct_complete(dreq);
}
#endif
L
Linus Torvalds 已提交
664

665
/* rpc_call_done hook for direct writes: hand off to nfs_writeback_done(). */
static void nfs_direct_write_result(struct rpc_task *task, void *calldata)
{
	struct nfs_write_data *wdata = calldata;

	nfs_writeback_done(task, wdata);
}

/*
 * NB: Return the value of the first error return code.  Subsequent
 *     errors after the first one are ignored.
 */
static void nfs_direct_write_release(void *calldata)
{
	struct nfs_write_data *data = calldata;
679 680
	struct nfs_pgio_header *hdr = data->header;
	struct nfs_direct_req *dreq = (struct nfs_direct_req *) hdr->req;
681
	int status = data->task.tk_status;
682

683
	spin_lock(&dreq->lock);
L
Linus Torvalds 已提交
684

685
	if (unlikely(status < 0)) {
686
		/* An error has occurred, so we should not commit */
687
		dreq->flags = 0;
688 689
		dreq->error = status;
	}
690 691
	if (unlikely(dreq->error != 0))
		goto out_unlock;
692 693

	dreq->count += data->res.count;
L
Linus Torvalds 已提交
694

695 696 697 698 699
	if (data->res.verf->committed != NFS_FILE_SYNC) {
		switch (dreq->flags) {
			case 0:
				memcpy(&dreq->verf, &data->verf, sizeof(dreq->verf));
				dreq->flags = NFS_ODIRECT_DO_COMMIT;
L
Linus Torvalds 已提交
700
				break;
701 702
			case NFS_ODIRECT_DO_COMMIT:
				if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) {
703
					dprintk("NFS: %5u write verify failed\n", data->task.tk_pid);
704 705
					dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
				}
L
Linus Torvalds 已提交
706 707
		}
	}
708
out_unlock:
709
	spin_unlock(&dreq->lock);
L
Linus Torvalds 已提交
710

711
	if (put_dreq(dreq))
712
		nfs_direct_write_complete(dreq, hdr->inode);
713 714 715
}

static const struct rpc_call_ops nfs_write_direct_ops = {
716
	.rpc_call_prepare = nfs_write_prepare,
717
	.rpc_call_done = nfs_direct_write_result,
718
	.rpc_release = nfs_direct_write_release,
719 720 721
};

/*
722 723 724 725 726
 * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
 * operation.  If nfs_writedata_alloc() or get_user_pages() fails,
 * bail and stop sending more writes.  Write length accounting is
 * handled automatically by nfs_direct_write_result().  Otherwise, if
 * no requests have been sent, just return an error.
727
 */
728 729 730
static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq,
						 const struct iovec *iov,
						 loff_t pos, int sync)
731
{
732
	struct nfs_open_context *ctx = dreq->ctx;
733
	struct inode *inode = ctx->dentry->d_inode;
734 735
	unsigned long user_addr = (unsigned long)iov->iov_base;
	size_t count = iov->iov_len;
736
	struct rpc_task *task;
737 738 739
	struct rpc_message msg = {
		.rpc_cred = ctx->cred,
	};
740 741
	struct rpc_task_setup task_setup_data = {
		.rpc_client = NFS_CLIENT(inode),
742
		.rpc_message = &msg,
743
		.callback_ops = &nfs_write_direct_ops,
744
		.workqueue = nfsiod_workqueue,
745 746
		.flags = RPC_TASK_ASYNC,
	};
747
	size_t wsize = NFS_SERVER(inode)->wsize;
748 749 750
	unsigned int pgbase;
	int result;
	ssize_t started = 0;
751

L
Linus Torvalds 已提交
752
	do {
753
		struct nfs_write_header *whdr;
754
		struct nfs_write_data *data;
F
Fred Isaman 已提交
755
		struct nfs_page_array *pages;
756 757
		size_t bytes;

758 759 760
		pgbase = user_addr & ~PAGE_MASK;
		bytes = min(wsize,count);

761
		result = -ENOMEM;
762
		whdr = nfs_writehdr_alloc();
763
		if (unlikely(!whdr))
764 765
			break;

766 767 768 769 770 771 772
		data = nfs_writedata_alloc(&whdr->header, nfs_page_array_len(pgbase, bytes));
		if (!data) {
			nfs_writehdr_free(&whdr->header);
			break;
		}
		data->header = &whdr->header;
		atomic_inc(&data->header->refcnt);
F
Fred Isaman 已提交
773
		pages = &data->pages;
774

775 776
		down_read(&current->mm->mmap_sem);
		result = get_user_pages(current, current->mm, user_addr,
F
Fred Isaman 已提交
777
					pages->npages, 0, 0, pages->pagevec, NULL);
778
		up_read(&current->mm->mmap_sem);
779
		if (result < 0) {
780
			nfs_direct_writehdr_release(whdr);
781 782
			break;
		}
F
Fred Isaman 已提交
783
		if ((unsigned)result < pages->npages) {
784 785
			bytes = result * PAGE_SIZE;
			if (bytes <= pgbase) {
F
Fred Isaman 已提交
786
				nfs_direct_release_pages(pages->pagevec, result);
787
				nfs_direct_writehdr_release(whdr);
788 789 790
				break;
			}
			bytes -= pgbase;
F
Fred Isaman 已提交
791
			pages->npages = result;
792 793 794 795
		}

		get_dreq(dreq);

796
		list_move_tail(&whdr->header.pages, &dreq->rewrite_list);
797

798 799 800
		whdr->header.req = (struct nfs_page *) dreq;
		whdr->header.inode = inode;
		whdr->header.cred = msg.rpc_cred;
801
		data->args.fh = NFS_FH(inode);
T
Trond Myklebust 已提交
802
		data->args.context = ctx;
803
		data->args.lock_context = dreq->l_ctx;
804
		data->args.offset = pos;
805
		data->args.pgbase = pgbase;
F
Fred Isaman 已提交
806
		data->args.pages = pages->pagevec;
807
		data->args.count = bytes;
808
		data->args.stable = sync;
809 810
		data->res.fattr = &data->fattr;
		data->res.count = bytes;
811
		data->res.verf = &data->verf;
812
		nfs_fattr_init(&data->fattr);
813

814
		task_setup_data.task = &data->task;
815
		task_setup_data.callback_data = data;
816 817 818
		msg.rpc_argp = &data->args;
		msg.rpc_resp = &data->res;
		NFS_PROTO(inode)->write_setup(data, &msg);
L
Linus Torvalds 已提交
819

820
		task = rpc_run_task(&task_setup_data);
821 822
		if (IS_ERR(task))
			break;
L
Linus Torvalds 已提交
823

C
Chuck Lever 已提交
824 825
		dprintk("NFS: %5u initiated direct write call "
			"(req %s/%Ld, %zu bytes @ offset %Lu)\n",
826
				task->tk_pid,
827 828 829 830
				inode->i_sb->s_id,
				(long long)NFS_FILEID(inode),
				bytes,
				(unsigned long long)data->args.offset);
831
		rpc_put_task(task);
L
Linus Torvalds 已提交
832

833 834
		started += bytes;
		user_addr += bytes;
835
		pos += bytes;
836 837

		/* FIXME: Remove this useless math from the final patch */
838 839
		pgbase += bytes;
		pgbase &= ~PAGE_MASK;
840
		BUG_ON(pgbase != (user_addr & ~PAGE_MASK));
L
Linus Torvalds 已提交
841

842 843
		count -= bytes;
	} while (count != 0);
844 845

	if (started)
846
		return started;
847
	return result < 0 ? (ssize_t) result : -EFAULT;
848
}
L
Linus Torvalds 已提交
849

850 851 852 853 854 855 856 857 858 859 860 861 862
/*
 * Start WRITEs for every segment of @iov, beginning at file offset
 * @pos, with stability level @sync.
 *
 * An extra I/O count is held on @dreq while WRITEs are being started
 * so the request cannot complete before scheduling has finished.
 * Scheduling stops at the first segment that fails or is only
 * partially started.
 *
 * Returns 0 if at least one byte was started.  If nothing was started,
 * drops one request reference and returns the error (-EIO if none was
 * recorded).
 */
static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
					       const struct iovec *iov,
					       unsigned long nr_segs,
					       loff_t pos, int sync)
{
	const struct iovec *vec = iov;
	const struct iovec *const end = iov + nr_segs;
	size_t requested_bytes = 0;
	ssize_t result = 0;

	get_dreq(dreq);

	for (; vec != end; vec++) {
		result = nfs_direct_write_schedule_segment(dreq, vec,
							   pos, sync);
		if (result < 0)
			break;
		requested_bytes += result;
		/* a short segment ends the whole request */
		if ((size_t)result < vec->iov_len)
			break;
		pos += vec->iov_len;
	}

	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 *
	 * NOTE(review): nothing on this path appears to free
	 * dreq->commit_data (nfs_direct_req_free() does not touch it) --
	 * confirm whether it is leaked when nothing was started.
	 */
	if (requested_bytes == 0) {
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

	/* drop the scheduling-time count; finish if all I/O is done */
	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq, dreq->inode);
	return 0;
}

887 888 889
/*
 * Set up a direct request for a write and run it.
 *
 * @iocb:    I/O control block of the call; remembered in the request
 *           only when it is not a synchronous kiocb
 * @iov:     user buffer segments to write from
 * @nr_segs: number of entries in @iov
 * @pos:     file offset at which to start writing
 * @count:   total number of bytes to write
 *
 * Writes are sent NFS_UNSTABLE (to be followed by a COMMIT) unless the
 * whole request fits in a single wsize'd WRITE or no commit data could
 * be allocated, in which case they are sent NFS_FILE_SYNC.
 *
 * Returns whatever nfs_direct_wait() reports once WRITEs have been
 * scheduled (byte count, error, or -EIOCBQUEUED for async requests),
 * or a negative errno if the request could not be set up or nothing
 * could be scheduled.
 *
 * Reference counting: nfs_direct_req_alloc() hands back a request
 * holding two references.  One is dropped at out_release below; the
 * other is dropped on behalf of nfs_direct_write_schedule_iovec(),
 * either through nfs_direct_complete() or directly when nothing was
 * scheduled.
 */
static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos,
				size_t count)
{
	ssize_t result = -ENOMEM;
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct nfs_direct_req *dreq;
	size_t wsize = NFS_SERVER(inode)->wsize;
	int sync = NFS_UNSTABLE;

	dreq = nfs_direct_req_alloc();
	if (!dreq)
		goto out;

	/*
	 * Acquire the contexts before allocating commit data, so that
	 * failing here leaves no commit data behind to leak.
	 */
	dreq->inode = inode;
	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
	dreq->l_ctx = nfs_get_lock_context(dreq->ctx);
	if (dreq->l_ctx == NULL) {
		/*
		 * Scheduling never ran, so nobody else will drop the
		 * second reference; drop it here or the request (and its
		 * open context) would be leaked.
		 */
		nfs_direct_req_release(dreq);
		goto out_release;
	}

	nfs_alloc_commit_data(dreq);
	if (dreq->commit_data == NULL || count <= wsize)
		sync = NFS_FILE_SYNC;

	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync);
	if (!result)
		result = nfs_direct_wait(dreq);
out_release:
	nfs_direct_req_release(dreq);
out:
	return result;
}

/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
925 926
 * @iov: vector of user buffers into which to read data
 * @nr_segs: size of iov vector
927
 * @pos: byte offset in file where reading starts
L
Linus Torvalds 已提交
928 929 930 931 932 933
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid gfar's check to see if
 * the request starts before the end of the file.  For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
934
 * READ where the file size could change.  Our preference is simply
L
Linus Torvalds 已提交
935 936
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
937
 *
L
Linus Torvalds 已提交
938 939 940 941 942
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
943 944
ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos)
L
Linus Torvalds 已提交
945 946 947 948
{
	ssize_t retval = -EINVAL;
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
949 950 951 952
	size_t count;

	count = iov_length(iov, nr_segs);
	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
L
Linus Torvalds 已提交
953

C
Chuck Lever 已提交
954
	dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n",
955 956
		file->f_path.dentry->d_parent->d_name.name,
		file->f_path.dentry->d_name.name,
957
		count, (long long) pos);
L
Linus Torvalds 已提交
958 959 960 961 962

	retval = 0;
	if (!count)
		goto out;

T
Trond Myklebust 已提交
963 964 965
	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;
L
Linus Torvalds 已提交
966

967 968
	task_io_account_read(count);

969
	retval = nfs_direct_read(iocb, iov, nr_segs, pos);
L
Linus Torvalds 已提交
970
	if (retval > 0)
971
		iocb->ki_pos = pos + retval;
L
Linus Torvalds 已提交
972 973 974 975 976 977 978 979

out:
	return retval;
}

/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
980 981
 * @iov: vector of user buffers from which to write data
 * @nr_segs: size of iov vector
982
 * @pos: byte offset in file where writing starts
L
Linus Torvalds 已提交
983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size.  The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache.  We let the server do generic write
 * parameter checking and report problems.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
999 1000
ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
				unsigned long nr_segs, loff_t pos)
L
Linus Torvalds 已提交
1001
{
C
Chuck Lever 已提交
1002
	ssize_t retval = -EINVAL;
L
Linus Torvalds 已提交
1003 1004
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
1005
	size_t count;
L
Linus Torvalds 已提交
1006

1007 1008 1009
	count = iov_length(iov, nr_segs);
	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);

C
Chuck Lever 已提交
1010
	dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n",
1011 1012
		file->f_path.dentry->d_parent->d_name.name,
		file->f_path.dentry->d_name.name,
1013
		count, (long long) pos);
1014

1015 1016
	retval = generic_write_checks(file, &pos, &count, 0);
	if (retval)
L
Linus Torvalds 已提交
1017
		goto out;
1018 1019 1020

	retval = -EINVAL;
	if ((ssize_t) count < 0)
L
Linus Torvalds 已提交
1021 1022 1023 1024
		goto out;
	retval = 0;
	if (!count)
		goto out;
1025

T
Trond Myklebust 已提交
1026 1027 1028
	retval = nfs_sync_mapping(mapping);
	if (retval)
		goto out;
L
Linus Torvalds 已提交
1029

1030 1031
	task_io_account_write(count);

1032
	retval = nfs_direct_write(iocb, iov, nr_segs, pos, count);
1033

L
Linus Torvalds 已提交
1034
	if (retval > 0)
1035
		iocb->ki_pos = pos + retval;
L
Linus Torvalds 已提交
1036 1037 1038 1039 1040

out:
	return retval;
}

1041 1042 1043 1044
/**
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
D
David Howells 已提交
1045
int __init nfs_init_directcache(void)
L
Linus Torvalds 已提交
1046 1047 1048
{
	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
						sizeof(struct nfs_direct_req),
1049 1050
						0, (SLAB_RECLAIM_ACCOUNT|
							SLAB_MEM_SPREAD),
1051
						NULL);
L
Linus Torvalds 已提交
1052 1053 1054 1055 1056 1057
	if (nfs_direct_cachep == NULL)
		return -ENOMEM;

	return 0;
}

1058
/**
D
David Howells 已提交
1059
 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
1060 1061
 *
 */
1062
void nfs_destroy_directcache(void)
L
Linus Torvalds 已提交
1063
{
1064
	kmem_cache_destroy(nfs_direct_cachep);
L
Linus Torvalds 已提交
1065
}