// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Red Hat, Inc.
 * Copyright (c) 2016-2021 Christoph Hellwig.
 */
#include <linux/module.h>
#include <linux/compiler.h>
#include <linux/fs.h>
#include <linux/fscrypt.h>
#include <linux/pagemap.h>
#include <linux/iomap.h>
#include <linux/backing-dev.h>
#include <linux/uio.h>
#include <linux/task_io_accounting_ops.h>
#include "trace.h"

#include "../internal.h"

/*
 * Private flags for iomap_dio, must not overlap with the public ones in
 * iomap.h:
 */
#define IOMAP_DIO_WRITE_FUA	(1 << 28)
#define IOMAP_DIO_NEED_SYNC	(1 << 29)
#define IOMAP_DIO_WRITE		(1 << 30)
#define IOMAP_DIO_DIRTY		(1 << 31)

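/*
 * Per-request direct I/O state, shared between the submission path and
 * the bio completion handler(s), and freed in iomap_dio_complete().
 */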
struct iomap_dio {
	struct kiocb		*iocb;
	const struct iomap_dio_ops *dops;
	loff_t			i_size;
	loff_t			size;
	atomic_t		ref;
	unsigned		flags;
	int			error;
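	/* bytes transferred by an earlier, faulted attempt (IOMAP_DIO_PARTIAL) */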
	size_t			done_before;
	bool			wait_for_completion;

	union {
		/* used during submission and for synchronous completion: */
		struct {
			struct iov_iter		*iter;
			struct task_struct	*waiter;
			struct bio		*poll_bio;
		} submit;

		/* used for aio completion: */
		struct {
			struct work_struct	work;
		} aio;
	};
};

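/*
 * Submit one bio of the request.  Each submitted bio takes a reference
 * on the dio that is dropped in iomap_dio_bio_end_io().  For IOCB_HIPRI
 * requests, remember the bio so the submitter can poll for its
 * completion.
 */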
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
		struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
	atomic_inc(&dio->ref);

	if (dio->iocb->ki_flags & IOCB_HIPRI) {
		bio_set_polled(bio, dio->iocb);
		dio->submit.poll_bio = bio;
	}

	if (dio->dops && dio->dops->submit_io)
		dio->dops->submit_io(iter, bio, pos);
	else
		submit_bio(bio);
}

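/*
 * Finish a direct I/O request: call the filesystem's ->end_io handler,
 * advance ki_pos (trimming reads that ran past i_size), invalidate page
 * cache that raced with a write, issue generic_write_sync() for O_(D)SYNC
 * writes that were not handled entirely with FUA writes, and account any
 * bytes transferred before a page fault (done_before).
 */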
ssize_t iomap_dio_complete(struct iomap_dio *dio)
{
	const struct iomap_dio_ops *dops = dio->dops;
	struct kiocb *iocb = dio->iocb;
	struct inode *inode = file_inode(iocb->ki_filp);
	loff_t offset = iocb->ki_pos;
	ssize_t ret = dio->error;

	if (dops && dops->end_io)
		ret = dops->end_io(iocb, dio->size, ret, dio->flags);

	if (likely(!ret)) {
		ret = dio->size;
		/* check for short read */
		if (offset + ret > dio->i_size &&
		    !(dio->flags & IOMAP_DIO_WRITE))
			ret = dio->i_size - offset;
		iocb->ki_pos += ret;
	}

	/*
	 * Try again to invalidate clean pages which might have been cached by
	 * non-direct readahead, or faulted in by get_user_pages() if the source
	 * of the write was an mmap'ed region of the file we're writing.  Either
	 * one is a pretty crazy thing to do, so we don't support it 100%.  If
	 * this invalidation fails, tough, the write still worked...
	 *
	 * And this page cache invalidation has to be after ->end_io(), as some
	 * filesystems convert unwritten extents to real allocations in
	 * ->end_io() when necessary, otherwise a racing buffer read would cache
	 * zeros from unwritten extents.
	 */
	if (!dio->error && dio->size &&
	    (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
		int err;
		err = invalidate_inode_pages2_range(inode->i_mapping,
				offset >> PAGE_SHIFT,
				(offset + dio->size - 1) >> PAGE_SHIFT);
		if (err)
			dio_warn_stale_pagecache(iocb->ki_filp);
	}

	inode_dio_end(file_inode(iocb->ki_filp));
	/*
	 * If this is a DSYNC write, make sure we push it to stable storage now
	 * that we've written data.
	 */
	if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
		ret = generic_write_sync(iocb, ret);

	if (ret > 0)
		ret += dio->done_before;

	kfree(dio);

	return ret;
}
EXPORT_SYMBOL_GPL(iomap_dio_complete);

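/* Deferred dio completion, run from the superblock's dio workqueue. */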
static void iomap_dio_complete_work(struct work_struct *work)
{
	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
	struct kiocb *iocb = dio->iocb;

	iocb->ki_complete(iocb, iomap_dio_complete(dio));
}

/*
 * Set an error in the dio if none is set yet.  We have to use cmpxchg
 * as the submission context and the completion context(s) can race to
 * update the error.
 */
static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
{
	cmpxchg(&dio->error, 0, ret);
}

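/*
 * Called when a bio completes: record any I/O error, and once the last
 * reference is dropped either wake a synchronous waiter, punt write
 * completion to the dio workqueue (it may need to block, e.g. for
 * unwritten extent conversion), or complete the dio directly from the
 * completion context.
 */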
static void iomap_dio_bio_end_io(struct bio *bio)
{
	struct iomap_dio *dio = bio->bi_private;
	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);

	if (bio->bi_status)
		iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));

	if (atomic_dec_and_test(&dio->ref)) {
		if (dio->wait_for_completion) {
			struct task_struct *waiter = dio->submit.waiter;
			WRITE_ONCE(dio->submit.waiter, NULL);
			blk_wake_io_task(waiter);
		} else if (dio->flags & IOMAP_DIO_WRITE) {
			struct inode *inode = file_inode(dio->iocb->ki_filp);

			WRITE_ONCE(dio->iocb->private, NULL);
			INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
			queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
		} else {
			WRITE_ONCE(dio->iocb->private, NULL);
			iomap_dio_complete_work(&dio->aio.work);
		}
	}

	if (should_dirty) {
		bio_check_pages_dirty(bio);
	} else {
		bio_release_pages(bio, false);
		bio_put(bio);
	}
}

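/*
 * Write out the shared zero page to pad a sub-block range, so that
 * unaligned writes to new or unwritten extents never expose stale data.
 */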
static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
		loff_t pos, unsigned len)
{
	struct inode *inode = file_inode(dio->iocb->ki_filp);
	struct page *page = ZERO_PAGE(0);
	int flags = REQ_SYNC | REQ_IDLE;
	struct bio *bio;

	bio = bio_alloc(iter->iomap.bdev, 1, REQ_OP_WRITE | flags, GFP_KERNEL);
	fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
				  GFP_KERNEL);
	bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
	bio->bi_private = dio;
	bio->bi_end_io = iomap_dio_bio_end_io;

	get_page(page);
	__bio_add_page(bio, page, len, 0);
	iomap_dio_submit_bio(iter, dio, bio, pos);
}

/*
 * Figure out the bio's operation flags from the dio request, the
 * mapping, and whether or not we want FUA.  Note that we can end up
 * clearing the WRITE_FUA flag in the dio request.
 */
static inline unsigned int iomap_dio_bio_opflags(struct iomap_dio *dio,
		const struct iomap *iomap, bool use_fua)
{
	unsigned int opflags = REQ_SYNC | REQ_IDLE;

	if (!(dio->flags & IOMAP_DIO_WRITE)) {
		WARN_ON_ONCE(iomap->flags & IOMAP_F_ZONE_APPEND);
		return REQ_OP_READ;
	}

	if (iomap->flags & IOMAP_F_ZONE_APPEND)
		opflags |= REQ_OP_ZONE_APPEND;
	else
		opflags |= REQ_OP_WRITE;

	if (use_fua)
		opflags |= REQ_FUA;
	else
		dio->flags &= ~IOMAP_DIO_WRITE_FUA;

	return opflags;
}

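/*
 * Issue the bios for the part of the request that maps to the current
 * iomap extent.  Returns the number of bytes submitted, or a negative
 * errno if nothing could be submitted.
 */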
static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
		struct iomap_dio *dio)
{
	const struct iomap *iomap = &iter->iomap;
	struct inode *inode = iter->inode;
	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
	unsigned int fs_block_size = i_blocksize(inode), pad;
	unsigned int align = iov_iter_alignment(dio->submit.iter);
	loff_t length = iomap_length(iter);
	loff_t pos = iter->pos;
	unsigned int bio_opf;
	struct bio *bio;
	bool need_zeroout = false;
	bool use_fua = false;
	int nr_pages, ret = 0;
	size_t copied = 0;
	size_t orig_count;

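	/* offset, length and memory must be aligned to the logical block size */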
	if ((pos | length | align) & ((1 << blkbits) - 1))
		return -EINVAL;

	if (iomap->type == IOMAP_UNWRITTEN) {
		dio->flags |= IOMAP_DIO_UNWRITTEN;
		need_zeroout = true;
	}

	if (iomap->flags & IOMAP_F_SHARED)
		dio->flags |= IOMAP_DIO_COW;

	if (iomap->flags & IOMAP_F_NEW) {
		need_zeroout = true;
	} else if (iomap->type == IOMAP_MAPPED) {
		/*
		 * Use a FUA write if we need datasync semantics, this is a pure
		 * data IO that doesn't require any metadata updates (including
		 * after IO completion such as unwritten extent conversion) and
		 * the underlying device supports FUA. This allows us to avoid
		 * cache flushes on IO completion.
		 */
		if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
		    (dio->flags & IOMAP_DIO_WRITE_FUA) &&
		    blk_queue_fua(bdev_get_queue(iomap->bdev)))
			use_fua = true;
	}

	/*
	 * Save the original count and trim the iter to just the extent we
	 * are operating on right now.  The iter will be re-expanded once
	 * we are done.
	 */
	orig_count = iov_iter_count(dio->submit.iter);
	iov_iter_truncate(dio->submit.iter, length);

	if (!iov_iter_count(dio->submit.iter))
		goto out;

	/*
	 * We can only poll for single bio I/Os.
	 */
	if (need_zeroout ||
	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode)))
		dio->iocb->ki_flags &= ~IOCB_HIPRI;

	if (need_zeroout) {
		/* zero out from the start of the block to the write offset */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(iter, dio, pos - pad, pad);
	}

	/*
	 * Set the operation flags early so that bio_iov_iter_get_pages
	 * can set up the page vector appropriately for a ZONE_APPEND
	 * operation.
	 */
	bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);

	nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
	do {
		size_t n;
		if (dio->error) {
			iov_iter_revert(dio->submit.iter, copied);
			copied = ret = 0;
			goto out;
		}

		bio = bio_alloc(iomap->bdev, nr_pages, bio_opf, GFP_KERNEL);
		fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
					  GFP_KERNEL);
		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
		bio->bi_ioprio = dio->iocb->ki_ioprio;
		bio->bi_private = dio;
		bio->bi_end_io = iomap_dio_bio_end_io;

		ret = bio_iov_iter_get_pages(bio, dio->submit.iter);
		if (unlikely(ret)) {
			/*
			 * We have to stop part way through an IO. We must fall
			 * through to the sub-block tail zeroing here, otherwise
			 * this short IO may expose stale data in the tail of
			 * the block we haven't written data to.
			 */
			bio_put(bio);
			goto zero_tail;
		}

		n = bio->bi_iter.bi_size;
		if (dio->flags & IOMAP_DIO_WRITE) {
			task_io_account_write(n);
		} else {
			if (dio->flags & IOMAP_DIO_DIRTY)
				bio_set_pages_dirty(bio);
		}

		dio->size += n;
		copied += n;

		nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter,
						 BIO_MAX_VECS);
		/*
		 * We can only poll for single bio I/Os.
		 */
		if (nr_pages)
			dio->iocb->ki_flags &= ~IOCB_HIPRI;
		iomap_dio_submit_bio(iter, dio, bio, pos);
		pos += n;
	} while (nr_pages);

	/*
	 * We need to zeroout the tail of a sub-block write if the extent type
	 * requires zeroing or the write extends beyond EOF. If we don't zero
	 * the block tail in the latter case, we can expose stale data via mmap
	 * reads of the EOF block.
	 */
zero_tail:
	if (need_zeroout ||
	    ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
		/* zero out from the end of the write to the end of the block */
		pad = pos & (fs_block_size - 1);
		if (pad)
			iomap_dio_zero(iter, dio, pos, fs_block_size - pad);
	}
out:
	/* Undo iter limitation to current extent */
	iov_iter_reexpand(dio->submit.iter, orig_count - copied);
	if (copied)
		return copied;
	return ret;
}

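/* Reads from a hole just zero the user buffer. */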
static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter,
		struct iomap_dio *dio)
{
	loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter);

	dio->size += length;
	if (!length)
		return -EFAULT;
	return length;
}

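/*
 * Copy directly between the user buffer and the inline data in the
 * inode; writes beyond i_size zero the gap and extend the inode size.
 */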
static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi,
		struct iomap_dio *dio)
{
	const struct iomap *iomap = &iomi->iomap;
	struct iov_iter *iter = dio->submit.iter;
	void *inline_data = iomap_inline_data(iomap, iomi->pos);
	loff_t length = iomap_length(iomi);
	loff_t pos = iomi->pos;
	size_t copied;

	if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap)))
		return -EIO;

	if (dio->flags & IOMAP_DIO_WRITE) {
		loff_t size = iomi->inode->i_size;

		if (pos > size)
			memset(iomap_inline_data(iomap, size), 0, pos - size);
		copied = copy_from_iter(inline_data, length, iter);
		if (copied) {
			if (pos + copied > size)
				i_size_write(iomi->inode, pos + copied);
			mark_inode_dirty(iomi->inode);
		}
	} else {
		copied = copy_to_iter(inline_data, length, iter);
	}
	dio->size += copied;
	if (!copied)
		return -EFAULT;
	return copied;
}

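/* Dispatch one iomap mapping to the appropriate handler. */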
static loff_t iomap_dio_iter(const struct iomap_iter *iter,
		struct iomap_dio *dio)
{
	switch (iter->iomap.type) {
	case IOMAP_HOLE:
		if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
			return -EIO;
		return iomap_dio_hole_iter(iter, dio);
	case IOMAP_UNWRITTEN:
		if (!(dio->flags & IOMAP_DIO_WRITE))
			return iomap_dio_hole_iter(iter, dio);
		return iomap_dio_bio_iter(iter, dio);
	case IOMAP_MAPPED:
		return iomap_dio_bio_iter(iter, dio);
	case IOMAP_INLINE:
		return iomap_dio_inline_iter(iter, dio);
	case IOMAP_DELALLOC:
		/*
		 * DIO is not serialised against mmap() access at all, and so
		 * if the page_mkwrite occurs between the writeback and the
		 * iomap_iter() call in the DIO path, then it will see the
		 * DELALLOC block that the page-mkwrite allocated.
		 */
		pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n",
				    dio->iocb->ki_filp, current->comm);
		return -EIO;
	default:
		WARN_ON_ONCE(1);
		return -EIO;
	}
}

/*
 * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
 * is being issued as AIO or not.  This allows us to optimise pure data writes
 * to use REQ_FUA rather than requiring generic_write_sync() to issue a
 * REQ_FLUSH post write. This is slightly tricky because a single request here
 * can be mapped into multiple disjoint IOs and only a subset of the IOs issued
 * may be pure data writes. In that case, we still need to do a full data sync
 * completion.
 *
 * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL,
 * __iomap_dio_rw can return a partial result if it encounters a non-resident
 * page in @iter after preparing a transfer.  In that case, the non-resident
 * pages can be faulted in and the request resumed with @done_before set to the
 * number of bytes previously transferred.  The request will then complete with
 * the correct total number of bytes transferred; this is essential for
 * completing partial requests asynchronously.
 *
 * Returns -ENOTBLK in case of a page invalidation failure for writes.
 * The caller needs to fall back to buffered I/O in this case.
 */
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
		unsigned int dio_flags, size_t done_before)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	struct inode *inode = file_inode(iocb->ki_filp);
	struct iomap_iter iomi = {
		.inode		= inode,
		.pos		= iocb->ki_pos,
		.len		= iov_iter_count(iter),
		.flags		= IOMAP_DIRECT,
	};
	loff_t end = iomi.pos + iomi.len - 1, ret = 0;
	bool wait_for_completion =
		is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT);
	struct blk_plug plug;
	struct iomap_dio *dio;

	if (!iomi.len)
		return NULL;

	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
	if (!dio)
		return ERR_PTR(-ENOMEM);

	dio->iocb = iocb;
	atomic_set(&dio->ref, 1);
	dio->size = 0;
	dio->i_size = i_size_read(inode);
	dio->dops = dops;
	dio->error = 0;
	dio->flags = 0;
	dio->done_before = done_before;

	dio->submit.iter = iter;
	dio->submit.waiter = current;
	dio->submit.poll_bio = NULL;

	if (iov_iter_rw(iter) == READ) {
		if (iomi.pos >= dio->i_size)
			goto out_free_dio;

		if (iocb->ki_flags & IOCB_NOWAIT) {
			if (filemap_range_needs_writeback(mapping, iomi.pos,
					end)) {
				ret = -EAGAIN;
				goto out_free_dio;
			}
			iomi.flags |= IOMAP_NOWAIT;
		}

		if (iter_is_iovec(iter))
			dio->flags |= IOMAP_DIO_DIRTY;
	} else {
		iomi.flags |= IOMAP_WRITE;
		dio->flags |= IOMAP_DIO_WRITE;

		if (iocb->ki_flags & IOCB_NOWAIT) {
			if (filemap_range_has_page(mapping, iomi.pos, end)) {
				ret = -EAGAIN;
				goto out_free_dio;
			}
			iomi.flags |= IOMAP_NOWAIT;
		}

		/* for data sync or sync, we need sync completion processing */
		if (iocb->ki_flags & IOCB_DSYNC)
			dio->flags |= IOMAP_DIO_NEED_SYNC;

		/*
		 * For datasync only writes, we optimistically try using FUA for
		 * this IO.  Any non-FUA write that occurs will clear this flag,
		 * hence we know before completion whether a cache flush is
		 * necessary.
		 */
		if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
			dio->flags |= IOMAP_DIO_WRITE_FUA;
	}

	if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
		ret = -EAGAIN;
		if (iomi.pos >= dio->i_size ||
		    iomi.pos + iomi.len > dio->i_size)
			goto out_free_dio;
		iomi.flags |= IOMAP_OVERWRITE_ONLY;
	}

	ret = filemap_write_and_wait_range(mapping, iomi.pos, end);
	if (ret)
		goto out_free_dio;

	if (iov_iter_rw(iter) == WRITE) {
		/*
		 * Try to invalidate cache pages for the range we are writing.
		 * If this invalidation fails, let the caller fall back to
		 * buffered I/O.
		 */
		if (invalidate_inode_pages2_range(mapping,
				iomi.pos >> PAGE_SHIFT, end >> PAGE_SHIFT)) {
			trace_iomap_dio_invalidate_fail(inode, iomi.pos,
							iomi.len);
			ret = -ENOTBLK;
			goto out_free_dio;
		}

		if (!wait_for_completion && !inode->i_sb->s_dio_done_wq) {
			ret = sb_init_dio_done_wq(inode->i_sb);
			if (ret < 0)
				goto out_free_dio;
		}
	}

	inode_dio_begin(inode);

	blk_start_plug(&plug);
	while ((ret = iomap_iter(&iomi, ops)) > 0) {
		iomi.processed = iomap_dio_iter(&iomi, dio);

		/*
		 * We can only poll for single bio I/Os.
		 */
		iocb->ki_flags &= ~IOCB_HIPRI;
	}

	blk_finish_plug(&plug);

	/*
	 * We only report that we've read data up to i_size.
	 * Revert iter to a state corresponding to that as some callers (such
	 * as the splice code) rely on it.
	 */
	if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size)
		iov_iter_revert(iter, iomi.pos - dio->i_size);

	if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) {
		if (!(iocb->ki_flags & IOCB_NOWAIT))
			wait_for_completion = true;
		ret = 0;
	}

	/* magic error code to fall back to buffered I/O */
	if (ret == -ENOTBLK) {
		wait_for_completion = true;
		ret = 0;
	}
	if (ret < 0)
		iomap_dio_set_error(dio, ret);

	/*
	 * If all the writes we issued were FUA, we don't need to flush the
	 * cache on IO completion. Clear the sync flag for this case.
	 */
	if (dio->flags & IOMAP_DIO_WRITE_FUA)
		dio->flags &= ~IOMAP_DIO_NEED_SYNC;

	WRITE_ONCE(iocb->private, dio->submit.poll_bio);

	/*
	 * We are about to drop our additional submission reference, which
	 * might be the last reference to the dio.  There are three different
	 * ways we can progress here:
	 *
	 *  (a) If this is the last reference we will always complete and free
	 *	the dio ourselves.
	 *  (b) If this is not the last reference, and we serve an asynchronous
	 *	iocb, we must never touch the dio after the decrement, the
	 *	I/O completion handler will complete and free it.
	 *  (c) If this is not the last reference, but we serve a synchronous
	 *	iocb, the I/O completion handler will wake us up on the drop
	 *	of the final reference, and we will complete and free it here
	 *	after we got woken by the I/O completion handler.
	 */
	dio->wait_for_completion = wait_for_completion;
	if (!atomic_dec_and_test(&dio->ref)) {
		if (!wait_for_completion)
			return ERR_PTR(-EIOCBQUEUED);

		for (;;) {
			set_current_state(TASK_UNINTERRUPTIBLE);
			if (!READ_ONCE(dio->submit.waiter))
				break;

			if (!dio->submit.poll_bio ||
			    !bio_poll(dio->submit.poll_bio, NULL, 0))
				blk_io_schedule();
		}
		__set_current_state(TASK_RUNNING);
	}

	return dio;

out_free_dio:
	kfree(dio);
	if (ret)
		return ERR_PTR(ret);
	return NULL;
}
EXPORT_SYMBOL_GPL(__iomap_dio_rw);

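/*
 * Submit a direct I/O request and complete it, returning the number of
 * bytes transferred or a negative error (-EIOCBQUEUED for asynchronous
 * requests that will complete through ->ki_complete).
 */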
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
		const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
		unsigned int dio_flags, size_t done_before)
{
	struct iomap_dio *dio;

	dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before);
	if (IS_ERR_OR_NULL(dio))
		return PTR_ERR_OR_ZERO(dio);
	return iomap_dio_complete(dio);
}
EXPORT_SYMBOL_GPL(iomap_dio_rw);