// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_bmap_btree.h"
#include "xfs_reflink.h"
#include <linux/gfp.h>
#include <linux/mpage.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>

/*
 * structure owned by writepages passed to individual writepage calls
 */
struct xfs_writepage_ctx {
	struct xfs_bmbt_irec    imap;
	unsigned int		io_type;
	struct xfs_ioend	*ioend;
};

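/*
 * Walk the buffer heads attached to a page and report whether any of them
 * are still in delalloc or unwritten state.
 */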
void
xfs_count_page_state(
	struct page		*page,
	int			*delalloc,
	int			*unwritten)
{
	struct buffer_head	*bh, *head;

	*delalloc = *unwritten = 0;

	bh = head = page_buffers(page);
	do {
		if (buffer_unwritten(bh))
			(*unwritten) = 1;
		else if (buffer_delay(bh))
			(*delalloc) = 1;
	} while ((bh = bh->b_this_page) != head);
}

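/*
 * Return the block device backing this inode's data: the realtime device
 * for realtime inodes, otherwise the main data device.
 */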
struct block_device *
xfs_find_bdev_for_inode(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_bdev;
	else
		return mp->m_ddev_targp->bt_bdev;
}

struct dax_device *
xfs_find_daxdev_for_inode(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;

	if (XFS_IS_REALTIME_INODE(ip))
		return mp->m_rtdev_targp->bt_daxdev;
	else
		return mp->m_ddev_targp->bt_daxdev;
}

/*
 * We're now finished for good with this page.  Update the page state via the
 * associated buffer_heads, paying attention to the start and end offsets that
 * we need to process on the page.
 *
 * Note that we open code the action in end_buffer_async_write here so that we
 * only have to iterate over the buffers attached to the page once.  This is not
 * only more efficient, but also ensures that we only call end_page_writeback
 * at the end of the iteration, and thus avoids the pitfall of having the page
 * and buffers potentially freed after every call to end_buffer_async_write.
 */
static void
xfs_finish_page_writeback(
	struct inode		*inode,
	struct bio_vec		*bvec,
	int			error)
{
	struct buffer_head	*head = page_buffers(bvec->bv_page), *bh = head;
	bool			busy = false;
	unsigned int		off = 0;
	unsigned long		flags;

	ASSERT(bvec->bv_offset < PAGE_SIZE);
	ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
	ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
	ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);

	local_irq_save(flags);
	bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
	do {
		if (off >= bvec->bv_offset &&
		    off < bvec->bv_offset + bvec->bv_len) {
			ASSERT(buffer_async_write(bh));
			ASSERT(bh->b_end_io == NULL);

			if (error) {
				mark_buffer_write_io_error(bh);
				clear_buffer_uptodate(bh);
				SetPageError(bvec->bv_page);
			} else {
				set_buffer_uptodate(bh);
			}
			clear_buffer_async_write(bh);
			unlock_buffer(bh);
		} else if (buffer_async_write(bh)) {
			ASSERT(buffer_locked(bh));
			busy = true;
		}
		off += bh->b_size;
	} while ((bh = bh->b_this_page) != head);
	bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
	local_irq_restore(flags);

	if (!busy)
		end_page_writeback(bvec->bv_page);
}

/*
 * We're now finished for good with this ioend structure.  Update the page
 * state, release holds on bios, and finally free up memory.  Do not use the
 * ioend after this.
 */
STATIC void
xfs_destroy_ioend(
	struct xfs_ioend	*ioend,
	int			error)
{
	struct inode		*inode = ioend->io_inode;
	struct bio		*bio = &ioend->io_inline_bio;
	struct bio		*last = ioend->io_bio, *next;
	u64			start = bio->bi_iter.bi_sector;
	bool			quiet = bio_flagged(bio, BIO_QUIET);

	for (bio = &ioend->io_inline_bio; bio; bio = next) {
		struct bio_vec	*bvec;
		int		i;

		/*
		 * For the last bio, bi_private points to the ioend, so we
		 * need to explicitly end the iteration here.
		 */
		if (bio == last)
			next = NULL;
		else
			next = bio->bi_private;

		/* walk each page on bio, ending page IO on them */
		bio_for_each_segment_all(bvec, bio, i)
			xfs_finish_page_writeback(inode, bvec, error);

		bio_put(bio);
	}

	if (unlikely(error && !quiet)) {
		xfs_err_ratelimited(XFS_I(inode)->i_mount,
			"writeback error on sector %llu", start);
	}
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_d.di_size;
}

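/*
 * Reserve a size-update transaction up front, at I/O submission time, and
 * hand it to the ioend so that the completion side never has to allocate
 * transaction space itself.
 */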
STATIC int
xfs_setfilesize_trans_alloc(
	struct xfs_ioend	*ioend)
{
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0,
				XFS_TRANS_NOFS, &tp);
	if (error)
		return error;

	ioend->io_append_trans = tp;

	/*
	 * We may pass freeze protection with a transaction.  So tell lockdep
	 * we released it.
	 */
	__sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
	/*
	 * We hand off the transaction to the completion thread now, so
	 * clear the flag here.
	 */
	current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
	return 0;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
STATIC int
__xfs_setfilesize(
	struct xfs_inode	*ip,
	struct xfs_trans	*tp,
	xfs_off_t		offset,
	size_t			size)
{
	xfs_fsize_t		isize;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, offset + size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp);
		return 0;
	}

	trace_xfs_setfilesize(ip, offset, size);

	ip->i_d.di_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp);
}

int
xfs_setfilesize(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	size_t			size)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
	if (error)
		return error;

	return __xfs_setfilesize(ip, tp, offset, size);
}

STATIC int
xfs_setfilesize_ioend(
	struct xfs_ioend	*ioend,
	int			error)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_trans	*tp = ioend->io_append_trans;

	/*
	 * The transaction may have been allocated in the I/O submission thread,
	 * thus we need to mark ourselves as being in a transaction manually.
	 * Similarly for freeze protection.
	 */
	current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
	__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);

	/* we abort the update if there was an IO error */
	if (error) {
		xfs_trans_cancel(tp);
		return error;
	}

	return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_io(
	struct work_struct *work)
{
	struct xfs_ioend	*ioend =
		container_of(work, struct xfs_ioend, io_work);
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	xfs_off_t		offset = ioend->io_offset;
	size_t			size = ioend->io_size;
	int			error;

	/*
	 * Just clean up the in-memory structures if the fs has been shut down.
	 */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		error = -EIO;
		goto done;
	}

	/*
	 * Clean up any COW blocks on an I/O error.
	 */
	error = blk_status_to_errno(ioend->io_bio->bi_status);
	if (unlikely(error)) {
		switch (ioend->io_type) {
		case XFS_IO_COW:
			xfs_reflink_cancel_cow_range(ip, offset, size, true);
			break;
		}

		goto done;
	}

	/*
	 * Success:  commit the COW or unwritten blocks if needed.
	 */
	switch (ioend->io_type) {
	case XFS_IO_COW:
		error = xfs_reflink_end_cow(ip, offset, size);
		break;
	case XFS_IO_UNWRITTEN:
		/* writeback should never update isize */
		error = xfs_iomap_write_unwritten(ip, offset, size, false);
		break;
	default:
		ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
		break;
	}

done:
	if (ioend->io_append_trans)
		error = xfs_setfilesize_ioend(ioend, error);
	xfs_destroy_ioend(ioend, error);
}

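/*
 * Bio completion handler: punt unwritten extent conversion, COW remapping
 * and on-disk size updates to a workqueue, and finish everything else
 * directly in bio completion context.
 */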
STATIC void
xfs_end_bio(
	struct bio		*bio)
{
	struct xfs_ioend	*ioend = bio->bi_private;
	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;

	if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
	else if (ioend->io_append_trans)
		queue_work(mp->m_data_workqueue, &ioend->io_work);
	else
		xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
}

STATIC int
xfs_map_blocks(
	struct xfs_writepage_ctx *wpc,
	struct inode		*inode,
	loff_t			offset)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			count = i_blocksize(inode);
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
	struct xfs_bmbt_irec	imap;
	int			whichfork = XFS_DATA_FORK;
	struct xfs_iext_cursor	icur;
	bool			imap_valid;
	int			error = 0;

	/*
	 * We have to make sure the cached mapping is within EOF to protect
	 * against eofblocks trimming on file release leaving us with a stale
	 * mapping. Otherwise, a page for a subsequent file extending buffered
	 * write could get picked up by this writeback cycle and written to the
	 * wrong blocks.
	 *
	 * Note that what we really want here is a generic mapping invalidation
	 * mechanism to protect us from arbitrary extent modifying contexts, not
	 * just eofblocks.
	 */
	xfs_trim_extent_eof(&wpc->imap, ip);

	/*
	 * COW fork blocks can overlap data fork blocks even if the blocks
	 * aren't shared.  COW I/O always takes precedent, so we must always
	 * check for overlap on reflink inodes unless the mapping is already a
	 * COW one.
	 */
	imap_valid = offset_fsb >= wpc->imap.br_startoff &&
		     offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
	if (imap_valid &&
	    (!xfs_is_reflink_inode(ip) || wpc->io_type == XFS_IO_COW))
		return 0;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	/*
	 * If we don't have a valid map, now it's time to get a new one for this
	 * offset.  This will convert delayed allocations (including COW ones)
	 * into real extents.  If we return without a valid map, it means we
	 * landed in a hole and we skip the block.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
	       (ip->i_df.if_flags & XFS_IFEXTENTS));
	ASSERT(offset <= mp->m_super->s_maxbytes);

	if (offset > mp->m_super->s_maxbytes - count)
		count = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);

	/*
	 * Check if this offset is covered by a COW extent, and if yes use
	 * it directly instead of looking up anything in the data fork.
	 */
	if (xfs_is_reflink_inode(ip) &&
	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap) &&
	    imap.br_startoff <= offset_fsb) {
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		/*
		 * Truncate can race with writeback since writeback doesn't
		 * take the iolock and truncate decreases the file size before
		 * it starts truncating the pages between new_size and old_size.
		 * Therefore, we can end up in the situation where writeback
		 * gets a CoW fork mapping but the truncate makes the mapping
		 * invalid and we end up in here trying to get a new mapping.
		 * bail out here so that we simply never get a valid mapping
		 * and so we drop the write altogether.  The page truncation
		 * will kill the contents anyway.
		 */
		if (offset > i_size_read(inode)) {
			wpc->io_type = XFS_IO_HOLE;
			return 0;
		}
		whichfork = XFS_COW_FORK;
		wpc->io_type = XFS_IO_COW;
		goto allocate_blocks;
	}

	/*
	 * Map valid and no COW extent in the way?  We're done.
	 */
	if (imap_valid) {
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		return 0;
	}

	/*
	 * If we don't have a valid map, now it's time to get a new one for this
	 * offset.  This will convert delayed allocations (including COW ones)
	 * into real extents.
	 */
	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (imap.br_startoff > offset_fsb) {
		/* landed in a hole or beyond EOF */
		imap.br_blockcount = imap.br_startoff - offset_fsb;
		imap.br_startoff = offset_fsb;
		imap.br_startblock = HOLESTARTBLOCK;
		wpc->io_type = XFS_IO_HOLE;
	} else {
		if (isnullstartblock(imap.br_startblock)) {
			/* got a delalloc extent */
			wpc->io_type = XFS_IO_DELALLOC;
			goto allocate_blocks;
		}

		if (imap.br_state == XFS_EXT_UNWRITTEN)
			wpc->io_type = XFS_IO_UNWRITTEN;
		else
			wpc->io_type = XFS_IO_OVERWRITE;
	}

	wpc->imap = imap;
	trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
	return 0;
allocate_blocks:
	error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap);
	if (error)
		return error;
	wpc->imap = imap;
	trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
	return 0;
}

STATIC void
xfs_start_page_writeback(
	struct page		*page,
	int			clear_dirty)
{
	ASSERT(PageLocked(page));
	ASSERT(!PageWriteback(page));

	/*
	 * if the page was not fully cleaned, we need to ensure that the higher
	 * layers come back to it correctly. That means we need to keep the page
	 * dirty, and for WB_SYNC_ALL writeback we need to ensure the
	 * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
	 * write this page in this writeback sweep will be made.
	 */
	if (clear_dirty) {
		clear_page_dirty_for_io(page);
		set_page_writeback(page);
	} else
		set_page_writeback_keepwrite(page);

	unlock_page(page);
}

/*
 * Submit the bio for an ioend. We are passed an ioend with a bio attached to
 * it, and we submit that bio. The ioend may be used for multiple bio
 * submissions, so we only want to allocate an append transaction for the ioend
 * once. In the case of multiple bio submission, each bio will take an IO
 * reference to the ioend to ensure that the ioend completion is only done once
 * all bios have been submitted and the ioend is really done.
 *
 * If @status is non-zero, it means that we have a situation where some part of
 * the submission process has failed after we have marked pages for writeback
 * and unlocked them. In this situation, we need to fail the bio and ioend
 * rather than submit it to IO. This typically only happens on a filesystem
 * shutdown.
 */
STATIC int
xfs_submit_ioend(
	struct writeback_control *wbc,
	struct xfs_ioend	*ioend,
	int			status)
{
	/* Convert CoW extents to regular */
	if (!status && ioend->io_type == XFS_IO_COW) {
		/*
		 * Yuk. This can do memory allocation, but is not a
		 * transactional operation so everything is done in GFP_KERNEL
		 * context. That can deadlock, because we hold pages in
		 * writeback state and GFP_KERNEL allocations can block on them.
		 * Hence we must operate in nofs conditions here.
		 */
		unsigned nofs_flag;

		nofs_flag = memalloc_nofs_save();
		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
				ioend->io_offset, ioend->io_size);
		memalloc_nofs_restore(nofs_flag);
	}

	/* Reserve log space if we might write beyond the on-disk inode size. */
	if (!status &&
	    ioend->io_type != XFS_IO_UNWRITTEN &&
	    xfs_ioend_is_append(ioend) &&
	    !ioend->io_append_trans)
		status = xfs_setfilesize_trans_alloc(ioend);

	ioend->io_bio->bi_private = ioend;
	ioend->io_bio->bi_end_io = xfs_end_bio;
	ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);

	/*
	 * If we are failing the IO now, just mark the ioend with an
	 * error and finish it. This will run IO completion immediately
	 * as there is only one reference to the ioend at this point in
	 * time.
	 */
	if (status) {
		ioend->io_bio->bi_status = errno_to_blk_status(status);
		bio_endio(ioend->io_bio);
		return status;
	}

	ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
	submit_bio(ioend->io_bio);
	return 0;
}

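/*
 * Allocate an ioend with its embedded bio, pointed at the given block
 * device and starting sector.
 */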
static struct xfs_ioend *
xfs_alloc_ioend(
	struct inode		*inode,
	unsigned int		type,
	xfs_off_t		offset,
	struct block_device	*bdev,
	sector_t		sector)
{
	struct xfs_ioend	*ioend;
	struct bio		*bio;

	bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
	bio_set_dev(bio, bdev);
	bio->bi_iter.bi_sector = sector;

	ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
	INIT_LIST_HEAD(&ioend->io_list);
	ioend->io_type = type;
	ioend->io_inode = inode;
	ioend->io_size = 0;
	ioend->io_offset = offset;
	INIT_WORK(&ioend->io_work, xfs_end_io);
	ioend->io_append_trans = NULL;
	ioend->io_bio = bio;
	return ioend;
}

/*
 * Allocate a new bio, and chain the old bio to the new one.
 *
 * Note that we have to perform the chaining in this unintuitive order
 * so that the bi_private linkage is set up in the right direction for the
 * traversal in xfs_destroy_ioend().
 */
static void
xfs_chain_bio(
	struct xfs_ioend	*ioend,
	struct writeback_control *wbc,
	struct block_device	*bdev,
	sector_t		sector)
{
	struct bio *new;

	new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
	bio_set_dev(new, bdev);
	new->bi_iter.bi_sector = sector;
	bio_chain(ioend->io_bio, new);
	bio_get(ioend->io_bio);		/* for xfs_destroy_ioend */
	ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
	ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
	submit_bio(ioend->io_bio);
	ioend->io_bio = new;
}

/*
 * Test to see if we have an existing ioend structure that we could append to
 * first, otherwise finish off the current ioend and start another.
 */
STATIC void
xfs_add_to_ioend(
	struct inode		*inode,
	xfs_off_t		offset,
	struct page		*page,
	struct xfs_writepage_ctx *wpc,
	struct writeback_control *wbc,
	struct list_head	*iolist)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
	unsigned		len = i_blocksize(inode);
	unsigned		poff = offset & (PAGE_SIZE - 1);
	sector_t		sector;

	sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
		((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);

	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
	    sector != bio_end_sector(wpc->ioend->io_bio) ||
	    offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
		if (wpc->ioend)
			list_add(&wpc->ioend->io_list, iolist);
		wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
				bdev, sector);
	}

	/*
	 * If the block doesn't fit into the bio we need to allocate a new
	 * one.  This shouldn't happen more than once for a given block.
	 */
	while (bio_add_page(wpc->ioend->io_bio, page, len, poff) != len)
		xfs_chain_bio(wpc->ioend, wbc, bdev, sector);

	wpc->ioend->io_size += len;
}

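/*
 * Translate a file offset within the given extent mapping into a disk
 * block number and attach it to the buffer head.
 */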
STATIC void
xfs_map_buffer(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	sector_t		bn;
	struct xfs_mount	*m = XFS_I(inode)->i_mount;
	xfs_off_t		iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
	xfs_daddr_t		iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);

	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
	      ((offset - iomap_offset) >> inode->i_blkbits);

	ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));

	bh->b_blocknr = bn;
	set_buffer_mapped(bh);
}

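/*
 * Map a buffer to its on-disk location and get it into the state needed
 * for async write I/O: locked, mapped, uptodate and clean.
 */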
STATIC void
xfs_map_at_offset(
	struct inode		*inode,
	struct buffer_head	*bh,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset)
{
	ASSERT(imap->br_startblock != HOLESTARTBLOCK);
	ASSERT(imap->br_startblock != DELAYSTARTBLOCK);

	lock_buffer(bh);
	xfs_map_buffer(inode, bh, imap, offset);
	set_buffer_mapped(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);

	/*
	 * If this is a realtime file, data may be on a different device
	 * to that pointed to from the buffer_head b_bdev currently. We can't
	 * trust that the bufferhead has already been mapped correctly, so
	 * set the bdev now.
	 */
	bh->b_bdev = xfs_find_bdev_for_inode(inode);
	bh->b_end_io = NULL;
	set_buffer_async_write(bh);
	set_buffer_uptodate(bh);
	clear_buffer_dirty(bh);
}

STATIC void
xfs_vm_invalidatepage(
	struct page		*page,
	unsigned int		offset,
	unsigned int		length)
{
	trace_xfs_invalidatepage(page->mapping->host, page, offset,
				 length);

	/*
	 * If we are invalidating the entire page, clear the dirty state from it
	 * so that we can check for attempts to release dirty cached pages in
	 * xfs_vm_releasepage().
	 */
	if (offset == 0 && length >= PAGE_SIZE)
		cancel_dirty_page(page);
	block_invalidatepage(page, offset, length);
}

/*
 * If the page has delalloc buffers on it, we need to punch them out before we
 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
 * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
 * is done on that same region - the delalloc extent is returned when none is
 * supposed to be there.
 *
 * We prevent this by truncating away the delalloc regions on the page before
 * invalidating it. Because they are delalloc, we can do this without needing a
 * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
 * truncation without a transaction as there is no space left for block
 * reservation (typically why we see an ENOSPC in writeback).
 */
STATIC void
xfs_aops_discard_page(
	struct page		*page)
{
	struct inode		*inode = page->mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	loff_t			offset = page_offset(page);
	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, offset);
	int			error;

	if (XFS_FORCED_SHUTDOWN(mp))
		goto out_invalidate;

	xfs_alert(mp,
		"page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
			page, ip->i_ino, offset);

	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
			PAGE_SIZE / i_blocksize(inode));
	if (error && !XFS_FORCED_SHUTDOWN(mp))
		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
out_invalidate:
	xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
}

/*
 * We implement an immediate ioend submission policy here to avoid needing to
 * chain multiple ioends and hence nest mempool allocations which can violate
 * forward progress guarantees we need to provide. The current ioend we are
 * adding buffers to is cached on the writepage context, and if the new buffer
 * does not append to the cached ioend it will create a new ioend and cache that
 * instead.
 *
 * If a new ioend is created and cached, the old ioend is returned and queued
 * locally for submission once the entire page is processed or an error has been
 * detected.  While ioends are submitted immediately after they are completed,
 * batching optimisations are provided by higher level block plugging.
 *
 * At the end of a writeback pass, there will be a cached ioend remaining on the
 * writepage context that the caller will need to submit.
 */
static int
xfs_writepage_map(
	struct xfs_writepage_ctx *wpc,
	struct writeback_control *wbc,
	struct inode		*inode,
	struct page		*page,
	uint64_t		end_offset)
{
	LIST_HEAD(submit_list);
	struct xfs_ioend	*ioend, *next;
	struct buffer_head	*bh;
	ssize_t			len = i_blocksize(inode);
	uint64_t		file_offset;	/* file offset of page */
	unsigned		poffset;	/* offset into page */
	int			error = 0;
	int			count = 0;

	/*
	 * Walk the blocks on the page, and if we run off the end of the current
	 * map or find the current map invalid, grab a new one.  We only use
	 * bufferheads here to check per-block state - they no longer control
	 * the iteration through the page. This allows us to replace the
	 * bufferhead with some other state tracking mechanism in future.
	 */
	file_offset = page_offset(page);
	bh = page_buffers(page);
	for (poffset = 0;
	     poffset < PAGE_SIZE;
	     poffset += len, file_offset += len, bh = bh->b_this_page) {
		/* past the range we are writing, so nothing more to write. */
		if (file_offset >= end_offset)
			break;

		if (!buffer_uptodate(bh)) {
			if (PageUptodate(page))
				ASSERT(buffer_mapped(bh));
			continue;
		}

		error = xfs_map_blocks(wpc, inode, file_offset);
		if (error)
			break;
		if (wpc->io_type == XFS_IO_HOLE)
			continue;

		xfs_map_at_offset(inode, bh, &wpc->imap, file_offset);
		xfs_add_to_ioend(inode, file_offset, page, wpc, wbc,
				&submit_list);
		count++;
	}

	ASSERT(wpc->ioend || list_empty(&submit_list));

	/*
	 * On error, we have to fail the ioend here because we have locked
	 * buffers in the ioend. If we don't do this, we'll deadlock
	 * invalidating the page as that tries to lock the buffers on the page.
	 * Also, because we may have set pages under writeback, we have to make
	 * sure we run IO completion to mark the error state of the IO
	 * appropriately, so we can't cancel the ioend directly here. That means
	 * we have to mark this page as under writeback if we included any
	 * buffers from it in the ioend chain so that completion treats it
	 * correctly.
	 *
	 * If we didn't include the page in the ioend, then on error we can
	 * simply discard and unlock it as there are no other users of the page
	 * or its buffers right now. The caller will still need to trigger
	 * submission of outstanding ioends on the writepage context so they are
	 * treated correctly on error.
	 */
	if (count) {
		xfs_start_page_writeback(page, !error);

		/*
		 * Preserve the original error if there was one, otherwise catch
		 * submission errors here and propagate into subsequent ioend
		 * submissions.
		 */
		list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
			int error2;

			list_del_init(&ioend->io_list);
			error2 = xfs_submit_ioend(wbc, ioend, error);
			if (error2 && !error)
				error = error2;
		}
	} else if (error) {
		xfs_aops_discard_page(page);
		ClearPageUptodate(page);
		unlock_page(page);
	} else {
		/*
		 * We can end up here with no error and nothing to write if we
		 * race with a partial page truncate on a sub-page block sized
		 * filesystem. In that case we need to mark the page clean.
		 */
		xfs_start_page_writeback(page, 1);
		end_page_writeback(page);
	}

	mapping_set_error(page->mapping, error);
	return error;
}

/*
 * Write out a dirty page.
 *
 * For delalloc space on the page we need to allocate space and flush it.
 * For unwritten space on the page we need to start the conversion to
 * regular allocated space.
 * For any other dirty buffer heads on the page we should flush them.
 */
STATIC int
xfs_do_writepage(
	struct page		*page,
	struct writeback_control *wbc,
	void			*data)
{
	struct xfs_writepage_ctx *wpc = data;
	struct inode		*inode = page->mapping->host;
	loff_t			offset;
	uint64_t		end_offset;
	pgoff_t			end_index;

	trace_xfs_writepage(inode, page, 0, 0);

	ASSERT(page_has_buffers(page));

	/*
	 * Refuse to write the page out if we are called from reclaim context.
	 *
	 * This avoids stack overflows when called from deeply used stacks in
	 * random callers for direct reclaim or memcg reclaim.  We explicitly
	 * allow reclaim from kswapd as the stack usage there is relatively low.
	 *
	 * This should never happen except in the case of a VM regression so
	 * warn about it.
	 */
	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
			PF_MEMALLOC))
		goto redirty;

	/*
	 * Given that we do not allow direct reclaim to call us, we should
	 * never be called while in a filesystem transaction.
	 */
	if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
		goto redirty;

	/*
	 * Is this page beyond the end of the file?
	 *
	 * The page index is less than the end_index, adjust the end_offset
	 * to the highest offset that this page should represent.
	 * -----------------------------------------------------
	 * |			file mapping	       | <EOF> |
	 * -----------------------------------------------------
	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
	 * ^--------------------------------^----------|--------
	 * |     desired writeback range    |      see else    |
	 * ---------------------------------^------------------|
	 */
	offset = i_size_read(inode);
	end_index = offset >> PAGE_SHIFT;
	if (page->index < end_index)
		end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
	else {
		/*
		 * Check whether the page to write out is beyond or straddles
		 * i_size or not.
		 * -------------------------------------------------------
		 * |		file mapping		        | <EOF>  |
		 * -------------------------------------------------------
		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
		 * ^--------------------------------^-----------|---------
		 * |				    |      Straddles     |
		 * ---------------------------------^-----------|--------|
		 */
		unsigned offset_into_page = offset & (PAGE_SIZE - 1);

		/*
		 * Skip the page if it is fully outside i_size, e.g. due to a
		 * truncate operation that is in progress. We must redirty the
		 * page so that reclaim stops reclaiming it. Otherwise
		 * xfs_vm_releasepage() is called on it and gets confused.
		 *
		 * Note that the end_index is unsigned long.  It would overflow
		 * if the given offset is greater than 16TB on a 32-bit system
		 * and if we do check the page is fully outside i_size or not
		 * via "if (page->index >= end_index + 1)", as "end_index + 1"
		 * will be evaluated to 0.  Hence this page will be redirtied
		 * and written out repeatedly, which would result in an
		 * infinite loop; the user program that performs this operation
		 * will hang.  Instead, we can verify this situation by checking
		 * if the page to write is totally beyond the i_size or if its
		 * offset is just equal to the EOF.
		 */
		if (page->index > end_index ||
		    (page->index == end_index && offset_into_page == 0))
			goto redirty;

		/*
		 * The page straddles i_size.  It must be zeroed out on each
		 * and every writepage invocation because it may be mmapped.
		 * "A file is mapped in multiples of the page size.  For a file
		 * that is not a multiple of the page size, the remaining
		 * memory is zeroed when mapped, and writes to that region are
		 * not written out to the file."
		 */
		zero_user_segment(page, offset_into_page, PAGE_SIZE);

		/* Adjust the end_offset to the end of file */
		end_offset = offset;
	}

	return xfs_writepage_map(wpc, wbc, inode, page, end_offset);

redirty:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return 0;
}

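/*
 * ->writepage entry point: run the single page through the writepage
 * context and submit any ioend that was built for it.
 */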
STATIC int
xfs_vm_writepage(
	struct page		*page,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = {
		.io_type = XFS_IO_INVALID,
	};
	int			ret;

	ret = xfs_do_writepage(page, wbc, &wpc);
	if (wpc.ioend)
		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
	return ret;
}

STATIC int
xfs_vm_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = {
		.io_type = XFS_IO_INVALID,
	};
	int			ret;

	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
	if (wpc.ioend)
		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
	return ret;
}

STATIC int
xfs_dax_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	return dax_writeback_mapping_range(mapping,
			xfs_find_bdev_for_inode(mapping->host), wbc);
}

/*
 * Called to move a page into cleanable state - and from there
 * to be released. The page should already be clean. We always
 * have buffer heads in this call.
 *
 * Returns 1 if the page is ok to release, 0 otherwise.
 */
STATIC int
xfs_vm_releasepage(
	struct page		*page,
	gfp_t			gfp_mask)
{
	int			delalloc, unwritten;

	trace_xfs_releasepage(page->mapping->host, page, 0, 0);

	/*
	 * mm accommodates an old ext3 case where clean pages might not have had
	 * the dirty bit cleared. Thus, it can send actual dirty pages to
	 * ->releasepage() via shrink_active_list(). Conversely,
	 * block_invalidatepage() can send pages that are still marked dirty but
	 * otherwise have invalidated buffers.
	 *
	 * We want to release the latter to avoid unnecessary buildup of the
	 * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages
	 * that are entirely invalidated and need to be released.  Hence the
	 * only time we should get dirty pages here is through
	 * shrink_active_list() and so we can simply skip those now.
	 *
	 * Warn if we've left any lingering delalloc/unwritten buffers on clean
	 * or invalidated pages we are about to release.
	 */
	if (PageDirty(page))
		return 0;

	xfs_count_page_state(page, &delalloc, &unwritten);

	if (WARN_ON_ONCE(delalloc))
		return 0;
	if (WARN_ON_ONCE(unwritten))
		return 0;

	return try_to_free_buffers(page);
}

/*
 * If this is O_DIRECT or the mpage code calling, tell them how large the mapping
 * is, so that we can avoid repeated get_blocks calls.
 *
 * If the mapping spans EOF, then we have to break the mapping up as the mapping
 * for blocks beyond EOF must be marked new so that sub block regions can be
 * correctly zeroed. We can't do this for mappings within EOF unless the mapping
 * was just allocated or is unwritten, otherwise the callers would overwrite
 * existing data with zeros. Hence we have to split the mapping into a range up
 * to and including EOF, and a second mapping for beyond EOF.
 */
static void
xfs_map_trim_size(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	struct xfs_bmbt_irec	*imap,
	xfs_off_t		offset,
	ssize_t			size)
{
	xfs_off_t		mapping_size;

	mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
	mapping_size <<= inode->i_blkbits;

	ASSERT(mapping_size > 0);
	if (mapping_size > size)
		mapping_size = size;
	if (offset < i_size_read(inode) &&
	    (xfs_ufsize_t)offset + mapping_size >= i_size_read(inode)) {
		/* limit mapping to block that spans EOF */
		mapping_size = roundup_64(i_size_read(inode) - offset,
					  i_blocksize(inode));
	}
	if (mapping_size > LONG_MAX)
		mapping_size = LONG_MAX;

	bh_result->b_size = mapping_size;
}

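/*
 * Legacy get_blocks callback.  It is only used by the mpage read paths for
 * block size < page size filesystems and never creates new blocks.
 */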
static int
xfs_get_blocks(
	struct inode		*inode,
	sector_t		iblock,
	struct buffer_head	*bh_result,
	int			create)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_fileoff_t		offset_fsb, end_fsb;
	int			error = 0;
	int			lockmode = 0;
	struct xfs_bmbt_irec	imap;
	int			nimaps = 1;
	xfs_off_t		offset;
	ssize_t			size;

	BUG_ON(create);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	offset = (xfs_off_t)iblock << inode->i_blkbits;
	ASSERT(bh_result->b_size >= i_blocksize(inode));
	size = bh_result->b_size;

	if (offset >= i_size_read(inode))
		return 0;

	/*
	 * Direct I/O is usually done on preallocated files, so try getting
	 * a block mapping without an exclusive lock first.
	 */
	lockmode = xfs_ilock_data_map_shared(ip);

	ASSERT(offset <= mp->m_super->s_maxbytes);
	if (offset > mp->m_super->s_maxbytes - size)
		size = mp->m_super->s_maxbytes - offset;
	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
	offset_fsb = XFS_B_TO_FSBT(mp, offset);

	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
			&nimaps, 0);
	if (error)
		goto out_unlock;
	if (!nimaps) {
		trace_xfs_get_blocks_notfound(ip, offset, size);
		goto out_unlock;
	}

	trace_xfs_get_blocks_found(ip, offset, size,
		imap.br_state == XFS_EXT_UNWRITTEN ?
			XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap);
	xfs_iunlock(ip, lockmode);

	/* trim mapping down to size requested */
	xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);

	/*
	 * For unwritten extents do not report a disk address in the buffered
	 * read case (treat as if we're reading into a hole).
	 */
	if (xfs_bmap_is_real_extent(&imap))
		xfs_map_buffer(inode, bh_result, &imap, offset);

	/*
	 * If this is a realtime file, data may be on a different device
	 * to that pointed to from the buffer_head b_bdev currently.
	 */
	bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
	return 0;

out_unlock:
	xfs_iunlock(ip, lockmode);
	return error;
}

STATIC sector_t
xfs_vm_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct xfs_inode	*ip = XFS_I(mapping->host);

	trace_xfs_vm_bmap(ip);

	/*
	 * The swap code (ab-)uses ->bmap to get a block mapping and then
	 * bypasses the file system for actual I/O.  We really can't allow
	 * that on reflink inodes, so we have to skip out here.  And yes,
	 * 0 is the magic code for a bmap error.
	 *
	 * Since we don't pass back blockdev info, we can't return bmap
	 * information for rt files either.
	 */
	if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
		return 0;
	return iomap_bmap(mapping, block, &xfs_iomap_ops);
}

STATIC int
xfs_vm_readpage(
	struct file		*unused,
	struct page		*page)
{
	trace_xfs_vm_readpage(page->mapping->host, 1);
	if (i_blocksize(page->mapping->host) == PAGE_SIZE)
		return iomap_readpage(page, &xfs_iomap_ops);
	return mpage_readpage(page, xfs_get_blocks);
}

STATIC int
xfs_vm_readpages(
	struct file		*unused,
	struct address_space	*mapping,
	struct list_head	*pages,
	unsigned		nr_pages)
{
	trace_xfs_vm_readpages(mapping->host, nr_pages);
	if (i_blocksize(mapping->host) == PAGE_SIZE)
		return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
}

/*
 * This is basically a copy of __set_page_dirty_buffers() with one
 * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
 * dirty, we'll never be able to clean them because we don't write buffers
 * beyond EOF, and that means we can't invalidate pages that span EOF
 * that have been marked dirty. Further, the dirty state can leak into
 * the file interior if the file is extended, resulting in all sorts of
 * bad things happening as the state does not match the underlying data.
 *
 * XXX: this really indicates that bufferheads in XFS need to die. Warts like
 * this only exist because of bufferheads and how the generic code manages them.
 */
STATIC int
xfs_vm_set_page_dirty(
	struct page		*page)
{
	struct address_space	*mapping = page->mapping;
	struct inode		*inode = mapping->host;
	loff_t			end_offset;
	loff_t			offset;
	int			newly_dirty;

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	end_offset = i_size_read(inode);
	offset = page_offset(page);

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			if (offset < end_offset)
				set_buffer_dirty(bh);
			bh = bh->b_this_page;
			offset += i_blocksize(inode);
		} while (bh != head);
	}
	/*
	 * Lock out page->mem_cgroup migration to keep PageDirty
	 * synchronized with per-memcg dirty page counters.
	 */
	lock_page_memcg(page);
	newly_dirty = !TestSetPageDirty(page);
	spin_unlock(&mapping->private_lock);

	if (newly_dirty)
		__set_page_dirty(page, mapping, 1);
	unlock_page_memcg(page);
	if (newly_dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	return newly_dirty;
}

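/*
 * Point the swap code at the backing block device and let the generic
 * iomap helper build the swap extent list.
 */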
static int
xfs_iomap_swapfile_activate(
	struct swap_info_struct		*sis,
	struct file			*swap_file,
	sector_t			*span)
{
	sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file));
	return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops);
}

const struct address_space_operations xfs_address_space_operations = {
	.readpage		= xfs_vm_readpage,
	.readpages		= xfs_vm_readpages,
	.writepage		= xfs_vm_writepage,
	.writepages		= xfs_vm_writepages,
	.set_page_dirty		= xfs_vm_set_page_dirty,
	.releasepage		= xfs_vm_releasepage,
	.invalidatepage		= xfs_vm_invalidatepage,
	.bmap			= xfs_vm_bmap,
	.direct_IO		= noop_direct_IO,
	.migratepage		= buffer_migrate_page,
	.is_partially_uptodate  = block_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
	.swap_activate		= xfs_iomap_swapfile_activate,
};

const struct address_space_operations xfs_dax_aops = {
	.writepages		= xfs_dax_writepages,
	.direct_IO		= noop_direct_IO,
	.set_page_dirty		= noop_set_page_dirty,
	.invalidatepage		= noop_invalidatepage,
	.swap_activate		= xfs_iomap_swapfile_activate,
};