// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>

/*
 * structure owned by writepages passed to individual writepage calls
 */
struct xfs_writepage_ctx {
	struct xfs_bmbt_irec    imap;
	unsigned int		io_type;
	struct xfs_ioend	*ioend;
};

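/*
 * Walk the buffer heads attached to a page and report whether any of them
 * are still in delalloc or unwritten state.
 */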
void
xfs_count_page_state(
	struct page		*page,
	int			*delalloc,
	int			*unwritten)
{
	struct buffer_head	*bh, *head;

	*delalloc = *unwritten = 0;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh))
			(*unwritten) = 1;
		else if (buffer_delay(bh))
			(*delalloc) = 1;
	} while ((bh = bh->b_this_page) != head);
}

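/*
 * Return the block device backing this inode's data: the realtime device
 * for realtime inodes, the main data device for everything else.
 */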
struct block_device *
xfs_find_bdev_for_inode(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_bdev;
	else
		return mp->m_ddev_targp->bt_bdev;
}

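/*
 * As above, but return the DAX device backing the inode's data.
 */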
struct dax_device *
xfs_find_daxdev_for_inode(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_daxdev;
	else
		return mp->m_ddev_targp->bt_daxdev;
}

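/*
 * Completion for a page without buffer heads attached: record any error on
 * the mapping and end writeback on the whole page.
 */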
static void
xfs_finish_page_writeback(
	struct inode		*inode,
	struct bio_vec		*bvec,
	int			error)
{
	if (error) {
		SetPageError(bvec->bv_page);
		mapping_set_error(inode->i_mapping, -EIO);
	}
	end_page_writeback(bvec->bv_page);
}

/*
 * We're now finished for good with this page.  Update the page state via the
 * associated buffer_heads, paying attention to the start and end offsets that
 * we need to process on the page.
 *
 * Note that we open code the action in end_buffer_async_write here so that we
 * only have to iterate over the buffers attached to the page once.  This is not
 * only more efficient, but also ensures that we only call end_page_writeback
 * at the end of the iteration, and thus avoids the pitfall of having the page
 * and buffers potentially freed after every call to end_buffer_async_write.
 */
static void
xfs_finish_buffer_writeback(
	struct inode		*inode,
	struct bio_vec		*bvec,
	int			error)
{
	struct buffer_head	*head = page_buffers(bvec->bv_page), *bh = head;
	bool			busy = false;
	unsigned int		off = 0;
	unsigned long		flags;

	ASSERT(bvec->bv_offset < PAGE_SIZE);
	ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
	ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
	ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);

	local_irq_save(flags);
	bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
	do {
		if (off >= bvec->bv_offset &&
		    off < bvec->bv_offset + bvec->bv_len) {
			ASSERT(buffer_async_write(bh));
			ASSERT(bh->b_end_io == NULL);

			if (error) {
				mark_buffer_write_io_error(bh);
				clear_buffer_uptodate(bh);
				SetPageError(bvec->bv_page);
			} else {
				set_buffer_uptodate(bh);
			}
			clear_buffer_async_write(bh);
			unlock_buffer(bh);
		} else if (buffer_async_write(bh)) {
			ASSERT(buffer_locked(bh));
			busy = true;
		}
		off += bh->b_size;
	} while ((bh = bh->b_this_page) != head);
	bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
	local_irq_restore(flags);

	if (!busy)
		end_page_writeback(bvec->bv_page);
}

/*
 * We're now finished for good with this ioend structure.  Update the page
 * state, release holds on bios, and finally free up memory.  Do not use the
 * ioend after this.
 */
STATIC void
xfs_destroy_ioend(
	struct xfs_ioend	*ioend,
	int			error)
{
	struct inode		*inode = ioend->io_inode;
	struct bio		*bio = &ioend->io_inline_bio;
	struct bio		*last = ioend->io_bio, *next;
	u64			start = bio->bi_iter.bi_sector;
	bool			quiet = bio_flagged(bio, BIO_QUIET);

	for (bio = &ioend->io_inline_bio; bio; bio = next) {
		struct bio_vec	*bvec;
		int		i;

		/*
		 * For the last bio, bi_private points to the ioend, so we
		 * need to explicitly end the iteration here.
		 */
		if (bio == last)
			next = NULL;
		else
			next = bio->bi_private;

		/* walk each page on bio, ending page IO on them */
		bio_for_each_segment_all(bvec, bio, i) {
			if (page_has_buffers(bvec->bv_page))
				xfs_finish_buffer_writeback(inode, bvec, error);
			else
				xfs_finish_page_writeback(inode, bvec, error);
		}
		bio_put(bio);
	}

	if (unlikely(error && !quiet)) {
		xfs_err_ratelimited(XFS_I(inode)->i_mount,
			"writeback error on sector %llu", start);
	}
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_d.di_size;
}

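/*
 * Reserve log space for an on-disk file size update ahead of I/O submission.
 * The transaction, along with the freeze protection it holds, is handed over
 * to the I/O completion thread.
 */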
STATIC int
xfs_setfilesize_trans_alloc(
	struct xfs_ioend	*ioend)
{
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0,
				XFS_TRANS_NOFS, &tp);
	if (error)
		return error;

	ioend->io_append_trans = tp;

	/*
	 * We may pass freeze protection with a transaction.  So tell lockdep
	 * we released it.
	 */
	__sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
	/*
	 * We hand off the transaction to the completion thread now, so
	 * clear the flag here.
	 */
	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
	return 0;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
__xfs_setfilesize(
	struct xfs_inode	*ip,
	struct xfs_trans	*tp,
	xfs_off_t		offset,
	size_t			size)
{
	xfs_fsize_t		isize;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, offset + size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp);
		return 0;
	}

	trace_xfs_setfilesize(ip, offset, size);

	ip->i_d.di_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp);
}

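/*
 * Allocate a transaction and use it to update the on-disk file size.
 */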
int
xfs_setfilesize(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	size_t			size)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
	if (error)
		return error;

	return __xfs_setfilesize(ip, tp, offset, size);
}

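/*
 * Finish an on-disk size update at ioend completion time using the
 * transaction that was allocated at submission.
 */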
STATIC int
xfs_setfilesize_ioend(
	struct xfs_ioend	*ioend,
	int			error)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_trans	*tp = ioend->io_append_trans;

	/*
	 * The transaction may have been allocated in the I/O submission thread,
	 * thus we need to mark ourselves as being in a transaction manually.
	 * Similarly for freeze protection.
	 */
	current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);

	/* we abort the update if there was an IO error */
	if (error) {
		xfs_trans_cancel(tp);
		return error;
	}

	return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_io(
	struct work_struct *work)
{
	struct xfs_ioend	*ioend =
		container_of(work, struct xfs_ioend, io_work);
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	xfs_off_t		offset = ioend->io_offset;
	size_t			size = ioend->io_size;
	int			error;

	/*
	 * Just clean up the in-memory structures if the fs has been shut down.
	 */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		error = -EIO;
		goto done;
	}

	/*
	 * Clean up any COW blocks on an I/O error.
	 */
	error = blk_status_to_errno(ioend->io_bio->bi_status);
	if (unlikely(error)) {
		switch (ioend->io_type) {
		case XFS_IO_COW:
			xfs_reflink_cancel_cow_range(ip, offset, size, true);
			break;
		}

		goto done;
	}

	/*
	 * Success:  commit the COW or unwritten blocks if needed.
	 */
	switch (ioend->io_type) {
	case XFS_IO_COW:
		error = xfs_reflink_end_cow(ip, offset, size);
		break;
	case XFS_IO_UNWRITTEN:
		/* writeback should never update isize */
		error = xfs_iomap_write_unwritten(ip, offset, size, false);
		break;
	default:
		ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
		break;
	}

done:
	if (ioend->io_append_trans)
		error = xfs_setfilesize_ioend(ioend, error);
	xfs_destroy_ioend(ioend, error);
}

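/*
 * Bio completion handler.  Punt to a workqueue if completion needs to do
 * transactional work (unwritten extent conversion, COW remapping, or an
 * on-disk size update); otherwise tear down the ioend directly.
 */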
STATIC void
xfs_end_bio(
	struct bio		*bio)
{
	struct xfs_ioend	*ioend = bio->bi_private;
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;

	if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
	else if (ioend->io_append_trans)
		queue_work(mp->m_data_workqueue, &ioend->io_work);
	else
		xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
}

STATIC int
xfs_map_blocks(
	struct xfs_writepage_ctx *wpc,
	struct inode		*inode,
	loff_t			offset)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			count = i_blocksize(inode);
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
	struct xfs_bmbt_irec	imap;
	int			whichfork = XFS_DATA_FORK;
	struct xfs_iext_cursor	icur;
	bool			imap_valid;
	int			error = 0;

	/*
	 * We have to make sure the cached mapping is within EOF to protect
	 * against eofblocks trimming on file release leaving us with a stale
	 * mapping. Otherwise, a page for a subsequent file extending buffered
	 * write could get picked up by this writeback cycle and written to the
	 * wrong blocks.
	 *
	 * Note that what we really want here is a generic mapping invalidation
	 * mechanism to protect us from arbitrary extent modifying contexts, not
	 * just eofblocks.
	 */
	xfs_trim_extent_eof(&wpc->imap, ip);

	/*
	 * COW fork blocks can overlap data fork blocks even if the blocks
	 * aren't shared.  COW I/O always takes precedent, so we must always
	 * check for overlap on reflink inodes unless the mapping is already a
	 * COW one.
	 */
	imap_valid = offset_fsb >= wpc->imap.br_startoff &&
		     offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
	if (imap_valid &&
	    (!xfs_is_reflink_inode(ip) || wpc->io_type == XFS_IO_COW))
		return 0;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	/*
	 * If we don't have a valid map, now it's time to get a new one for this
	 * offset.  This will convert delayed allocations (including COW ones)
	 * into real extents.  If we return without a valid map, it means we
	 * landed in a hole and we skip the block.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       (ip->i_df.if_flags & XFS_IFEXTENTS));
	ASSERT(offset <= mp->m_super->s_maxbytes);

	if (offset > mp->m_super->s_maxbytes - count)
		count = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);

	/*
	 * Check if this offset is covered by a COW extent, and if so use
	 * it directly instead of looking up anything in the data fork.
	 */
	if (xfs_is_reflink_inode(ip) &&
	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap) &&
	    imap.br_startoff <= offset_fsb) {
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		/*
		 * Truncate can race with writeback since writeback doesn't
		 * take the iolock and truncate decreases the file size before
		 * it starts truncating the pages between new_size and old_size.
		 * Therefore, we can end up in the situation where writeback
		 * gets a CoW fork mapping but the truncate makes the mapping
		 * invalid and we end up in here trying to get a new mapping.
		 * Bail out here so that we simply never get a valid mapping
		 * and so we drop the write altogether.  The page truncation
		 * will kill the contents anyway.
		 */
		if (offset > i_size_read(inode)) {
			wpc->io_type = XFS_IO_HOLE;
			return 0;
		}
		whichfork = XFS_COW_FORK;
		wpc->io_type = XFS_IO_COW;
		goto allocate_blocks;
	}

	/*
	 * Map valid and no COW extent in the way?  We're done.
	 */
	if (imap_valid) {
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		return 0;
	}

	/*
	 * If we don't have a valid map, now it's time to get a new one for this
	 * offset.  This will convert delayed allocations (including COW ones)
	 * into real extents.
	 */
	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (imap.br_startoff > offset_fsb) {
		/* landed in a hole or beyond EOF */
		imap.br_blockcount = imap.br_startoff - offset_fsb;
		imap.br_startoff = offset_fsb;
		imap.br_startblock = HOLESTARTBLOCK;
		wpc->io_type = XFS_IO_HOLE;
	} else {
		if (isnullstartblock(imap.br_startblock)) {
			/* got a delalloc extent */
			wpc->io_type = XFS_IO_DELALLOC;
			goto allocate_blocks;
		}

		if (imap.br_state == XFS_EXT_UNWRITTEN)
			wpc->io_type = XFS_IO_UNWRITTEN;
		else
			wpc->io_type = XFS_IO_OVERWRITE;
	}

	wpc->imap = imap;
	trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
	return 0;
allocate_blocks:
	error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap);
	if (error)
		return error;
	wpc->imap = imap;
	trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
	return 0;
}

/*
 * Submit the bio for an ioend. We are passed an ioend with a bio attached to
 * it, and we submit that bio. The ioend may be used for multiple bio
 * submissions, so we only want to allocate an append transaction for the ioend
 * once. In the case of multiple bio submission, each bio will take an IO
 * reference to the ioend to ensure that the ioend completion is only done once
 * all bios have been submitted and the ioend is really done.
 *
 * If @status is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them. In this situation, we need to fail the bio and ioend
 * rather than submit it to IO. This typically only happens on a filesystem
 * shutdown.
 */
STATIC int
xfs_submit_ioend(
	struct writeback_control *wbc,
	struct xfs_ioend	*ioend,
	int			status)
{
	/* Convert CoW extents to regular */
	if (!status && ioend->io_type == XFS_IO_COW) {
		/*
		 * Yuk. This can do memory allocation, but is not a
		 * transactional operation so everything is done in GFP_KERNEL
		 * context. That can deadlock, because we hold pages in
		 * writeback state and GFP_KERNEL allocations can block on them.
		 * Hence we must operate in nofs conditions here.
		 */
		unsigned nofs_flag;

		nofs_flag = memalloc_nofs_save();
		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
				ioend->io_offset, ioend->io_size);
		memalloc_nofs_restore(nofs_flag);
	}

	/* Reserve log space if we might write beyond the on-disk inode size. */
	if (!status &&
	    ioend->io_type != XFS_IO_UNWRITTEN &&
	    xfs_ioend_is_append(ioend) &&
	    !ioend->io_append_trans)
		status = xfs_setfilesize_trans_alloc(ioend);

	ioend->io_bio->bi_private = ioend;
	ioend->io_bio->bi_end_io = xfs_end_bio;
	ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);

	/*
	 * If we are failing the IO now, just mark the ioend with an
	 * error and finish it. This will run IO completion immediately
	 * as there is only one reference to the ioend at this point in
	 * time.
	 */
	if (status) {
		ioend->io_bio->bi_status = errno_to_blk_status(status);
		bio_endio(ioend->io_bio);
		return status;
	}

	ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
	submit_bio(ioend->io_bio);
	return 0;
}

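/*
 * Allocate a new ioend for writeback starting at @offset.  The ioend is
 * embedded in the same allocation as its first bio (io_inline_bio), which
 * comes from xfs_ioend_bioset.
 */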
static struct xfs_ioend *
xfs_alloc_ioend(
	struct inode		*inode,
	unsigned int		type,
	xfs_off_t		offset,
	struct block_device	*bdev,
	sector_t		sector)
{
	struct xfs_ioend	*ioend;
	struct bio		*bio;

	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
	bio_set_dev(bio, bdev);
	bio->bi_iter.bi_sector = sector;

	ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
	INIT_LIST_HEAD(&ioend->io_list);
	ioend->io_type = type;
	ioend->io_inode = inode;
	ioend->io_size = 0;
	ioend->io_offset = offset;
	INIT_WORK(&ioend->io_work, xfs_end_io);
	ioend->io_append_trans = NULL;
	ioend->io_bio = bio;
	return ioend;
}

/*
 * Allocate a new bio, and chain the old bio to the new one.
 *
 * Note that we have to perform the chaining in this unintuitive order
 * so that the bi_private linkage is set up in the right direction for the
 * traversal in xfs_destroy_ioend().
 */
static void
xfs_chain_bio(
	struct xfs_ioend	*ioend,
	struct writeback_control *wbc,
	struct block_device	*bdev,
	sector_t		sector)
{
	struct bio *new;

	new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
	bio_set_dev(new, bdev);
	new->bi_iter.bi_sector = sector;
	bio_chain(ioend->io_bio, new);
	bio_get(ioend->io_bio);		/* for xfs_destroy_ioend */
	ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
	ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
	submit_bio(ioend->io_bio);
	ioend->io_bio = new;
}

/*
 * Test to see if we have an existing ioend structure that we could append to
 * first, otherwise finish off the current ioend and start another.
 */
STATIC void
xfs_add_to_ioend(
	struct inode		*inode,
	xfs_off_t		offset,
	struct page		*page,
	struct xfs_writepage_ctx *wpc,
	struct writeback_control *wbc,
	struct list_head	*iolist)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
	unsigned		len = i_blocksize(inode);
	unsigned		poff = offset & (PAGE_SIZE - 1);
	sector_t		sector;

	sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
		((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);

	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
	    sector != bio_end_sector(wpc->ioend->io_bio) ||
	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
		if (wpc->ioend)
			list_add(&wpc->ioend->io_list, iolist);
		wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
				bdev, sector);
	}

	/*
	 * If the block doesn't fit into the bio we need to allocate a new
	 * one.  This shouldn't happen more than once for a given block.
	 */
	while (bio_add_page(wpc->ioend->io_bio, page, len, poff) != len)
		xfs_chain_bio(wpc->ioend, wbc, bdev, sector);

	wpc->ioend->io_size += len;
}

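/*
 * Translate @offset into a disk block number using the extent mapping in
 * @imap and attach it to the buffer head.
 */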
STATIC void
xfs_map_buffer(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	sector_t		bn;
	struct xfs_mount	*m = XFS_I(inode)->i_mount;
	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);

	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
	      ((offset - iomap_offset) >> inode->i_blkbits);

	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));

	bh->b_blocknr = bn;
	set_buffer_mapped(bh);
}

STATIC void
xfs_map_at_offset(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	lock_buffer(bh);
	xfs_map_buffer(inode, bh, imap, offset);
	set_buffer_mapped(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);

	/*
	 * If this is a realtime file, data may be on a different device
	 * to that pointed to from the buffer_head b_bdev currently. We can't
	 * trust that the bufferhead has already been mapped correctly, so
	 * set the bdev now.
	 */
	bh->b_bdev = xfs_find_bdev_for_inode(inode);
	bh->b_end_io = NULL;
	set_buffer_async_write(bh);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
}

STATIC void
xfs_vm_invalidatepage(
	struct page		*page,
	unsigned int		offset,
	unsigned int		length)
{
	trace_xfs_invalidatepage(page->mapping->host, page, offset,
				 length);

	/*
	 * If we are invalidating the entire page, clear the dirty state from it
	 * so that we can check for attempts to release dirty cached pages in
	 * xfs_vm_releasepage().
	 */
	if (offset == 0 && length >= PAGE_SIZE)
		cancel_dirty_page(page);
	block_invalidatepage(page, offset, length);
}

/*
 * If the page has delalloc buffers on it, we need to punch them out before we
 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
 * is done on that same region - the delalloc extent is returned when none is
 * supposed to be there.
 *
 * We prevent this by truncating away the delalloc regions on the page before
 * invalidating it. Because they are delalloc, we can do this without needing a
 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
 * truncation without a transaction as there is no space left for block
 * reservation (typically why we see an ENOSPC in writeback).
 */
STATIC void
xfs_aops_discard_page(
	struct page		*page)
{
	struct inode		*inode = page->mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	loff_t			offset = page_offset(page);
	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, offset);
	int			error;

	if (XFS_FORCED_SHUTDOWN(mp))
		goto out_invalidate;

	xfs_alert(mp,
		"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
			page, ip->i_ino, offset);

	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
			PAGE_SIZE / i_blocksize(inode));
	if (error && !XFS_FORCED_SHUTDOWN(mp))
		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
out_invalidate:
	xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
}

/*
 * We implement an immediate ioend submission policy here to avoid needing to
 * chain multiple ioends and hence nest mempool allocations which can violate
 * forward progress guarantees we need to provide. The current ioend we are
 * adding buffers to is cached on the writepage context, and if the new buffer
 * does not append to the cached ioend it will create a new ioend and cache that
 * instead.
 *
 * If a new ioend is created and cached, the old ioend is returned and queued
 * locally for submission once the entire page is processed or an error has been
 * detected.  While ioends are submitted immediately after they are completed,
 * batching optimisations are provided by higher level block plugging.
 *
 * At the end of a writeback pass, there will be a cached ioend remaining on the
 * writepage context that the caller will need to submit.
 */
static int
xfs_writepage_map(
	struct xfs_writepage_ctx *wpc,
	struct writeback_control *wbc,
	struct inode		*inode,
	struct page		*page,
	uint64_t		end_offset)
{
	LIST_HEAD(submit_list);
	struct xfs_ioend	*ioend, *next;
	struct buffer_head	*bh = NULL;
	ssize_t			len = i_blocksize(inode);
	uint64_t		file_offset;	/* file offset of page */
	unsigned		poffset;	/* offset into page */
	int			error = 0;
	int			count = 0;

	if (page_has_buffers(page))
		bh = page_buffers(page);

	/*
	 * Walk the blocks on the page, and if we run off the end of the current
	 * map or find the current map invalid, grab a new one.  We only use
	 * bufferheads here to check per-block state - they no longer control
	 * the iteration through the page. This allows us to replace the
	 * bufferhead with some other state tracking mechanism in future.
	 */
	for (poffset = 0, file_offset = page_offset(page);
	     poffset < PAGE_SIZE;
	     poffset += len, file_offset += len) {
		/* past the range we are writing, so nothing more to write. */
		if (file_offset >= end_offset)
			break;

		if (bh && !buffer_uptodate(bh)) {
			if (PageUptodate(page))
				ASSERT(buffer_mapped(bh));
			bh = bh->b_this_page;
			continue;
		}

		error = xfs_map_blocks(wpc, inode, file_offset);
		if (error)
			break;

		if (wpc->io_type == XFS_IO_HOLE) {
			if (bh)
				bh = bh->b_this_page;
			continue;
		}

		if (bh) {
			xfs_map_at_offset(inode, bh, &wpc->imap, file_offset);
			bh = bh->b_this_page;
		}
		xfs_add_to_ioend(inode, file_offset, page, wpc, wbc,
				&submit_list);
		count++;
	}

	ASSERT(wpc->ioend || list_empty(&submit_list));
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));

	/*
	 * On error, we have to fail the ioend here because we have locked
	 * buffers in the ioend. If we don't do this, we'll deadlock
	 * invalidating the page as that tries to lock the buffers on the page.
	 * Also, because we may have set pages under writeback, we have to make
	 * sure we run IO completion to mark the error state of the IO
	 * appropriately, so we can't cancel the ioend directly here. That means
	 * we have to mark this page as under writeback if we included any
	 * buffers from it in the ioend chain so that completion treats it
	 * correctly.
	 *
	 * If we didn't include the page in the ioend, then on error we can
	 * simply discard and unlock it as there are no other users of the page
	 * or its buffers right now. The caller will still need to trigger
	 * submission of outstanding ioends on the writepage context so they are
	 * treated correctly on error.
	 */
	if (unlikely(error)) {
		if (!count) {
			xfs_aops_discard_page(page);
			ClearPageUptodate(page);
			unlock_page(page);
			goto done;
		}

		/*
		 * If the page was not fully cleaned, we need to ensure that the
		 * higher layers come back to it correctly.  That means we need
		 * to keep the page dirty, and for WB_SYNC_ALL writeback we need
		 * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
		 * so another attempt to write this page in this writeback sweep
		 * will be made.
		 */
		set_page_writeback_keepwrite(page);
	} else {
		clear_page_dirty_for_io(page);
		set_page_writeback(page);
	}

	unlock_page(page);

	/*
	 * Preserve the original error if there was one, otherwise catch
	 * submission errors here and propagate into subsequent ioend
	 * submissions.
	 */
	list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
		int error2;

		list_del_init(&ioend->io_list);
		error2 = xfs_submit_ioend(wbc, ioend, error);
		if (error2 && !error)
			error = error2;
	}

	/*
	 * We can end up here with no error and nothing to write if we race with
	 * a partial page truncate on a sub-page block sized filesystem.
	 */
	if (!count)
		end_page_writeback(page);
done:
	mapping_set_error(page->mapping, error);
	return error;
}

/*
 * Write out a dirty page.
 *
 * For delalloc space on the page we need to allocate space and flush it.
 * For unwritten space on the page we need to start the conversion to
 * regular allocated space.
 * For any other dirty buffer heads on the page we should flush them.
 */
STATIC int
xfs_do_writepage(
	struct page		*page,
	struct writeback_control *wbc,
	void			*data)
{
	struct xfs_writepage_ctx *wpc = data;
	struct inode		*inode = page->mapping->host;
	loff_t			offset;
	uint64_t		end_offset;
	pgoff_t			end_index;

	trace_xfs_writepage(inode, page, 0, 0);

	/*
	 * Refuse to write the page out if we are called from reclaim context.
	 *
	 * This avoids stack overflows when called from deeply used stacks in
	 * random callers for direct reclaim or memcg reclaim.  We explicitly
	 * allow reclaim from kswapd as the stack usage there is relatively low.
	 *
	 * This should never happen except in the case of a VM regression so
	 * warn about it.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
			PF_MEMALLOC))
		goto redirty;

	/*
	 * Given that we do not allow direct reclaim to call us, we should
	 * never be called while in a filesystem transaction.
	 */
	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
		goto redirty;

	/*
	 * Is this page beyond the end of the file?
	 *
	 * The page index is less than the end_index, adjust the end_offset
	 * to the highest offset that this page should represent.
	 * -----------------------------------------------------
	 * |			file mapping	       | <EOF> |
	 * -----------------------------------------------------
	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
	 * ^--------------------------------^----------|--------
	 * |     desired writeback range    |      see else    |
	 * ---------------------------------^------------------|
	 */
	offset = i_size_read(inode);
	end_index = offset >> PAGE_SHIFT;
	if (page->index < end_index)
		end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
	else {
		/*
		 * Check whether the page to write out is beyond or straddles
		 * i_size or not.
		 * -------------------------------------------------------
		 * |		file mapping		        | <EOF>  |
		 * -------------------------------------------------------
		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
		 * ^--------------------------------^-----------|---------
		 * |				    |      Straddles     |
		 * ---------------------------------^-----------|--------|
		 */
		unsigned offset_into_page = offset & (PAGE_SIZE - 1);

		/*
		 * Skip the page if it is fully outside i_size, e.g. due to a
		 * truncate operation that is in progress. We must redirty the
		 * page so that reclaim stops reclaiming it. Otherwise
		 * xfs_vm_releasepage() is called on it and gets confused.
		 *
		 * Note that the end_index is unsigned long, it would overflow
		 * if the given offset is greater than 16TB on a 32-bit system
		 * and if we do check the page is fully outside i_size or not
		 * via "if (page->index >= end_index + 1)" as "end_index + 1"
		 * will be evaluated to 0.  Hence this page will be redirtied
		 * and be written out repeatedly which would result in an
		 * infinite loop, the user program that performs this operation
		 * will hang.  Instead, we can verify this situation by checking
		 * if the page to write is totally beyond the i_size or if its
		 * offset is just equal to the EOF.
		 */
		if (page->index > end_index ||
		    (page->index == end_index && offset_into_page == 0))
			goto redirty;

		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining
		 * memory is zeroed when mapped, and writes to that region are
		 * not written out to the file."
		 */
		zero_user_segment(page, offset_into_page, PAGE_SIZE);

		/* Adjust the end_offset to the end of file */
		end_offset = offset;
	}

	return xfs_writepage_map(wpc, wbc, inode, page, end_offset);

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}

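/*
 * ->writepage entry point: write a single page using a writepage context on
 * the stack, then submit any ioend still cached on that context.
 */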
STATIC int
xfs_vm_writepage(
	struct page		*page,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = {
		.io_type = XFS_IO_INVALID,
	};
	int			ret;

	ret = xfs_do_writepage(page, wbc, &wpc);
	if (wpc.ioend)
		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
	return ret;
}

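/*
 * ->writepages entry point: let write_cache_pages() drive xfs_do_writepage()
 * with a single shared writepage context, then submit the final cached ioend.
 */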
STATIC int
xfs_vm_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = {
		.io_type = XFS_IO_INVALID,
	};
	int			ret;

	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
	if (wpc.ioend)
		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
	return ret;
}

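/*
 * ->writepages for DAX mappings: there are no buffer heads or bios to build
 * here, so hand the range off to dax_writeback_mapping_range() to flush the
 * dirty DAX mappings to the backing device.
 */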
STATIC int
xfs_dax_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	return dax_writeback_mapping_range(mapping,
			xfs_find_bdev_for_inode(mapping->host), wbc);
}

/*
 * Called to move a page into cleanable state - and from there
 * to be released. The page should already be clean. We always
 * have buffer heads in this call.
 *
 * Returns 1 if the page is ok to release, 0 otherwise.
 */
STATIC int
xfs_vm_releasepage(
	struct page		*page,
	gfp_t			gfp_mask)
{
	int			delalloc, unwritten;

	trace_xfs_releasepage(page->mapping->host, page, 0, 0);

	/*
	 * mm accommodates an old ext3 case where clean pages might not have had
	 * the dirty bit cleared. Thus, it can send actual dirty pages to
	 * ->releasepage() via shrink_active_list(). Conversely,
	 * block_invalidatepage() can send pages that are still marked dirty but
	 * otherwise have invalidated buffers.
	 *
	 * We want to release the latter to avoid unnecessary buildup of the
	 * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages
	 * that are entirely invalidated and need to be released.  Hence the
	 * only time we should get dirty pages here is through
	 * shrink_active_list() and so we can simply skip those now.
	 *
	 * Warn if we've left any lingering delalloc/unwritten buffers on clean
	 * or invalidated pages we are about to release.
	 */
	if (PageDirty(page))
		return 0;

	xfs_count_page_state(page, &delalloc, &unwritten);

	if (WARN_ON_ONCE(delalloc))
		return 0;
	if (WARN_ON_ONCE(unwritten))
		return 0;

	return try_to_free_buffers(page);
}

/*
 * If this is O_DIRECT or the mpage code calling, tell them how large the
 * mapping is, so that we can avoid repeated get_blocks calls.
 *
 * If the mapping spans EOF, then we have to break the mapping up as the mapping
 * for blocks beyond EOF must be marked new so that sub block regions can be
 * correctly zeroed. We can't do this for mappings within EOF unless the mapping
 * was just allocated or is unwritten, otherwise the callers would overwrite
 * existing data with zeros. Hence we have to split the mapping into a range up
 * to and including EOF, and a second mapping for beyond EOF.
 */
static void
xfs_map_trim_size(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset,
	ssize_t			size)
{
	xfs_off_t		mapping_size;

	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
	mapping_size <<= inode->i_blkbits;

	ASSERT(mapping_size > 0);
	if (mapping_size > size)
		mapping_size = size;
	if (offset < i_size_read(inode) &&
	    (xfs_ufsize_t)offset + mapping_size >= i_size_read(inode)) {
		/* limit mapping to block that spans EOF */
		mapping_size = roundup_64(i_size_read(inode) - offset,
					  i_blocksize(inode));
	}
	if (mapping_size > LONG_MAX)
		mapping_size = LONG_MAX;

	bh_result->b_size = mapping_size;
}

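/*
 * Read-only get_blocks implementation, used by the mpage read paths on
 * filesystems whose block size is smaller than the page size.  Mapping
 * requests with create set are not supported here.
 */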
static int
xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			lockmode = 0;
	struct xfs_bmbt_irec	imap;
	int			nimaps = 1;
	xfs_off_t		offset;
	ssize_t			size;

	BUG_ON(create);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	offset = (xfs_off_t)iblock << inode->i_blkbits;
	ASSERT(bh_result->b_size >= i_blocksize(inode));
	size = bh_result->b_size;

	if (offset >= i_size_read(inode))
		return 0;

	/*
	 * Direct I/O is usually done on preallocated files, so try getting
	 * a block mapping without an exclusive lock first.
	 */
	lockmode = xfs_ilock_data_map_shared(ip);

	ASSERT(offset <= mp->m_super->s_maxbytes);
	if (offset > mp->m_super->s_maxbytes - size)
		size = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
			&nimaps, 0);
	if (error)
		goto out_unlock;
	if (!nimaps) {
		trace_xfs_get_blocks_notfound(ip, offset, size);
		goto out_unlock;
	}

	trace_xfs_get_blocks_found(ip, offset, size,
		imap.br_state == XFS_EXT_UNWRITTEN ?
			XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap);
	xfs_iunlock(ip, lockmode);

	/* trim mapping down to size requested */
	xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);

	/*
	 * For unwritten extents do not report a disk address in the buffered
	 * read case (treat as if we're reading into a hole).
	 */
	if (xfs_bmap_is_real_extent(&imap))
		xfs_map_buffer(inode, bh_result, &imap, offset);

	/*
	 * If this is a realtime file, data may be on a different device
	 * to that pointed to from the buffer_head b_bdev currently.
	 */
	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
	return 0;

out_unlock:
	xfs_iunlock(ip, lockmode);
	return error;
}

STATIC sector_t
xfs_vm_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct xfs_inode	*ip = XFS_I(mapping->host);

	trace_xfs_vm_bmap(ip);

	/*
	 * The swap code (ab-)uses ->bmap to get a block mapping and then
	 * bypasses the file system for actual I/O.  We really can't allow
	 * that on reflink inodes, so we have to skip out here.  And yes,
	 * 0 is the magic code for a bmap error.
	 *
	 * Since we don't pass back blockdev info, we can't return bmap
	 * information for rt files either.
	 */
	if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
		return 0;
	return iomap_bmap(mapping, block, &xfs_iomap_ops);
}

STATIC int
xfs_vm_readpage(
	struct file		*unused,
	struct page		*page)
{
	trace_xfs_vm_readpage(page->mapping->host, 1);
	if (i_blocksize(page->mapping->host) == PAGE_SIZE)
		return iomap_readpage(page, &xfs_iomap_ops);
	return mpage_readpage(page, xfs_get_blocks);
}

STATIC int
xfs_vm_readpages(
	struct file		*unused,
	struct address_space	*mapping,
	struct list_head	*pages,
	unsigned		nr_pages)
{
	trace_xfs_vm_readpages(mapping->host, nr_pages);
	if (i_blocksize(mapping->host) == PAGE_SIZE)
		return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}

/*
 * This is basically a copy of __set_page_dirty_buffers() with one
 * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
 * dirty, we'll never be able to clean them because we don't write buffers
 * beyond EOF, and that means we can't invalidate pages that span EOF
 * that have been marked dirty. Further, the dirty state can leak into
 * the file interior if the file is extended, resulting in all sorts of
 * bad things happening as the state does not match the underlying data.
 *
 * XXX: this really indicates that bufferheads in XFS need to die. Warts like
 * this only exist because of bufferheads and how the generic code manages them.
 */
STATIC int
xfs_vm_set_page_dirty(
	struct page		*page)
{
	struct address_space	*mapping = page->mapping;
	struct inode		*inode = mapping->host;
	loff_t			end_offset;
	loff_t			offset;
	int			newly_dirty;

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	end_offset = i_size_read(inode);
	offset = page_offset(page);

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			if (offset < end_offset)
				set_buffer_dirty(bh);
			bh = bh->b_this_page;
			offset += i_blocksize(inode);
		} while (bh != head);
	}
	/*
	 * Lock out page->mem_cgroup migration to keep PageDirty
	 * synchronized with per-memcg dirty page counters.
	 */
	lock_page_memcg(page);
	newly_dirty = !TestSetPageDirty(page);
	spin_unlock(&mapping->private_lock);

	if (newly_dirty)
		__set_page_dirty(page, mapping, 1);
	unlock_page_memcg(page);
	if (newly_dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	return newly_dirty;
}

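/*
 * ->swap_activate: record the backing block device and let the iomap code
 * build the swap extent map.
 */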
static int
xfs_iomap_swapfile_activate(
	struct swap_info_struct		*sis,
	struct file			*swap_file,
	sector_t			*span)
{
	sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file));
	return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops);
}

const struct address_space_operations xfs_address_space_operations = {
	.readpage		= xfs_vm_readpage,
	.readpages		= xfs_vm_readpages,
	.writepage		= xfs_vm_writepage,
	.writepages		= xfs_vm_writepages,
	.set_page_dirty		= xfs_vm_set_page_dirty,
	.releasepage		= xfs_vm_releasepage,
	.invalidatepage		= xfs_vm_invalidatepage,
	.bmap			= xfs_vm_bmap,
	.direct_IO		= noop_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
	.swap_activate		= xfs_iomap_swapfile_activate,
};

const struct address_space_operations xfs_dax_aops = {
	.writepages		= xfs_dax_writepages,
	.direct_IO		= noop_direct_IO,
	.set_page_dirty		= noop_set_page_dirty,
	.invalidatepage		= noop_invalidatepage,
	.swap_activate		= xfs_iomap_swapfile_activate,
};