// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"

#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
#include <linux/fadvise.h>

static const struct vm_operations_struct xfs_file_vm_ops;

int
xfs_update_prealloc_flags(
	struct xfs_inode	*ip,
	enum xfs_prealloc_flags	flags)
{
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
			0, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	if (!(flags & XFS_PREALLOC_INVISIBLE)) {
		VFS_I(ip)->i_mode &= ~S_ISUID;
		if (VFS_I(ip)->i_mode & S_IXGRP)
			VFS_I(ip)->i_mode &= ~S_ISGID;
		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	}

	if (flags & XFS_PREALLOC_SET)
		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
	if (flags & XFS_PREALLOC_CLEAR)
		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	if (flags & XFS_PREALLOC_SYNC)
		xfs_trans_set_sync(tp);
	return xfs_trans_commit(tp);
}

/*
 * Fsync operations on directories are much simpler than on regular files,
 * as there is no file data to flush, and thus also no need for explicit
 * cache flush operations, and there are no non-transaction metadata updates
 * on directories either.
 */
STATIC int
xfs_dir_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);

	trace_xfs_dir_fsync(ip);
	return xfs_log_force_inode(ip);
}

STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	int			error = 0;
	int			log_flushed = 0;
	xfs_lsn_t		lsn = 0;

	trace_xfs_file_fsync(ip);

	error = file_write_and_wait_range(file, start, end);
	if (error)
		return error;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	/*
	 * If we have an RT and/or log subvolume we need to make sure to flush
	 * the write cache the device used for file data first.  This is to
	 * ensure newly written file data make it to disk before logging the new
	 * inode size in case of an extending write.
	 */
	if (XFS_IS_REALTIME_INODE(ip))
		xfs_blkdev_issue_flush(mp->m_rtdev_targp);
	else if (mp->m_logdev_targp != mp->m_ddev_targp)
		xfs_blkdev_issue_flush(mp->m_ddev_targp);

	/*
	 * All metadata updates are logged, which means that we just have to
	 * flush the log up to the latest LSN that touched the inode. If we have
	 * concurrent fsync/fdatasync() calls, we need them to all block on the
	 * log force before we clear the ili_fsync_fields field. This ensures
	 * that we don't get a racing sync operation that does not wait for the
	 * metadata to hit the journal before returning. If we race with
	 * clearing the ili_fsync_fields, then all that will happen is the log
	 * force will do nothing as the lsn will already be on disk. We can't
	 * race with setting ili_fsync_fields because that is done under
	 * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
	 * until after the ili_fsync_fields is cleared.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_ipincount(ip)) {
		if (!datasync ||
		    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
			lsn = ip->i_itemp->ili_last_lsn;
	}

	if (lsn) {
		error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
		ip->i_itemp->ili_fsync_fields = 0;
	}
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	/*
	 * If we only have a single device, and the log force above was
	 * a no-op we might have to flush the data device cache here.
	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
	 * an already allocated file and thus do not have any metadata to
	 * commit.
	 */
	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
	    mp->m_logdev_targp == mp->m_ddev_targp)
		xfs_blkdev_issue_flush(mp->m_ddev_targp);

	return error;
}

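/*
 * Direct I/O reads are issued under the shared iolock so they can run
 * concurrently with buffered reads and aligned direct writes.  Under
 * IOCB_NOWAIT we return -EAGAIN rather than block waiting for the lock.
 */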
STATIC ssize_t
xfs_file_dio_aio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	size_t			count = iov_iter_count(to);
	ssize_t			ret;

	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);

	if (!count)
		return 0; /* skip atime */

	file_accessed(iocb->ki_filp);

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, XFS_IOLOCK_SHARED);
	}
	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL,
			is_sync_kiocb(iocb));
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

static noinline ssize_t
xfs_file_dax_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	size_t			count = iov_iter_count(to);
	ssize_t			ret = 0;

	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);

	if (!count)
		return 0; /* skip atime */

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, XFS_IOLOCK_SHARED);
	}

	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	file_accessed(iocb->ki_filp);
	return ret;
}

STATIC ssize_t
xfs_file_buffered_aio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, XFS_IOLOCK_SHARED);
	}
	ret = generic_file_read_iter(iocb, to);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	if (IS_DAX(inode))
		ret = xfs_file_dax_read(iocb, to);
	else if (iocb->ki_flags & IOCB_DIRECT)
		ret = xfs_file_dio_aio_read(iocb, to);
	else
		ret = xfs_file_buffered_aio_read(iocb, to);

	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}

/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_aio_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	int			*iolock)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			error = 0;
	size_t			count = iov_iter_count(from);
	bool			drained_dio = false;
	loff_t			isize;

restart:
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
	if (error)
		return error;

	/*
	 * For changing security info in file_remove_privs() we need i_rwsem
	 * exclusively.
	 */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_iunlock(ip, *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		xfs_ilock(ip, *iolock);
		goto restart;
	}
	/*
	 * If the offset is beyond the size of the file, we need to zero any
	 * blocks that fall between the existing EOF and the start of this
	 * write.  If zeroing is needed and we are currently holding the
	 * iolock shared, we need to update it to exclusive which implies
	 * having to redo all checks before.
	 *
	 * We need to serialise against EOF updates that occur in IO
	 * completions here. We want to make sure that nobody is changing the
	 * size while we do this check until we have placed an IO barrier (i.e.
	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
	 * The spinlock effectively forms a memory barrier once we have the
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
	 * and hence be able to correctly determine if we need to run zeroing.
	 */
	spin_lock(&ip->i_flags_lock);
	isize = i_size_read(inode);
	if (iocb->ki_pos > isize) {
		spin_unlock(&ip->i_flags_lock);
		if (!drained_dio) {
			if (*iolock == XFS_IOLOCK_SHARED) {
				xfs_iunlock(ip, *iolock);
				*iolock = XFS_IOLOCK_EXCL;
				xfs_ilock(ip, *iolock);
				iov_iter_reexpand(from, count);
			}
			/*
			 * We now have an IO submission barrier in place, but
			 * AIO can do EOF updates during IO completion and hence
			 * we now need to wait for all of them to drain. Non-AIO
			 * DIO will have drained before we are given the
			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
			 * no-op.
			 */
			inode_dio_wait(inode);
			drained_dio = true;
			goto restart;
		}

		trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
		error = iomap_zero_range(inode, isize, iocb->ki_pos - isize,
				NULL, &xfs_buffered_write_iomap_ops);
		if (error)
			return error;
	} else
		spin_unlock(&ip->i_flags_lock);

	/*
	 * Updating the timestamps will grab the ilock again from
	 * xfs_fs_dirty_inode, so we have to call it after dropping the
	 * lock above.  Eventually we should look into a way to avoid
	 * the pointless lock roundtrip.
	 */
	return file_modified(file);
}

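/*
 * Completion handler for direct writes: finish any copy-on-write remapping,
 * convert unwritten extents covered by the write and, for extending writes,
 * update the in-core and on-disk inode size.
 */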
static int
xfs_dio_write_end_io(
	struct kiocb		*iocb,
	ssize_t			size,
	int			error,
	unsigned		flags)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	loff_t			offset = iocb->ki_pos;
	unsigned int		nofs_flag;

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	if (error)
		return error;
	if (!size)
		return 0;

	/*
	 * Capture amount written on completion as we can't reliably account
	 * for it on submission.
	 */
	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim.  To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	if (flags & IOMAP_DIO_COW) {
		error = xfs_reflink_end_cow(ip, offset, size);
		if (error)
			goto out;
	}

	/*
	 * Unwritten conversion updates the in-core isize after extent
	 * conversion but before updating the on-disk size. Updating isize any
	 * earlier allows a racing dio read to find unwritten extents before
	 * they are converted.
	 */
	if (flags & IOMAP_DIO_UNWRITTEN) {
		error = xfs_iomap_write_unwritten(ip, offset, size, true);
		goto out;
	}

	/*
	 * We need to update the in-core inode size here so that we don't end up
	 * with the on-disk inode size being outside the in-core inode size. We
	 * have no other method of updating EOF for AIO, so always do it here
	 * if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF. Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 */
	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode)) {
		i_size_write(inode, offset + size);
		spin_unlock(&ip->i_flags_lock);
		error = xfs_setfilesize(ip, offset, size);
	} else {
		spin_unlock(&ip->i_flags_lock);
	}

out:
	memalloc_nofs_restore(nofs_flag);
	return error;
}

static const struct iomap_dio_ops xfs_dio_write_ops = {
	.end_io		= xfs_dio_write_end_io,
};

/*
 * xfs_file_dio_aio_write - handle direct IO writes
 *
 * Lock the inode appropriately to prepare for and issue a direct IO write.
 * By separating it from the buffered write path we remove all the tricky to
 * follow locking changes and looping.
 *
 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
 * pages are flushed out.
 *
 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
 * allowing them to be done in parallel with reads and other direct IO writes.
 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
 * needs to do sub-block zeroing and that requires serialisation against other
 * direct IOs to the same block. In this case we need to serialise the
 * submission of the unaligned IOs so that we don't get racing block zeroing in
 * the dio layer.  To avoid the problem with aio, we also need to wait for
 * outstanding IOs to complete so that unwritten extent conversion is completed
 * before we try to map the overlapping block. This is currently implemented by
 * hitting it with a big hammer (i.e. inode_dio_wait()).
 *
 * Returns with locks held indicated by @iolock and errors indicated by
 * negative return values.
 */
STATIC ssize_t
xfs_file_dio_aio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;
	int			unaligned_io = 0;
	int			iolock;
	size_t			count = iov_iter_count(from);
	struct xfs_buftarg      *target = xfs_inode_buftarg(ip);

	/* DIO must be aligned to device logical sector size */
	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
		return -EINVAL;

	/*
	 * Don't take the exclusive iolock here unless the I/O is unaligned to
	 * the file system block size.  We don't need to consider the EOF
	 * extension case here because xfs_file_aio_write_checks() will relock
	 * the inode as necessary for EOF zeroing cases and fill out the new
	 * inode size as appropriate.
	 */
	if ((iocb->ki_pos & mp->m_blockmask) ||
	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
		unaligned_io = 1;

		/*
		 * We can't properly handle unaligned direct I/O to reflink
		 * files yet, as we can't unshare a partial block.
		 */
		if (xfs_is_cow_inode(ip)) {
			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
			return -EREMCHG;
		}
		iolock = XFS_IOLOCK_EXCL;
	} else {
		iolock = XFS_IOLOCK_SHARED;
	}

	if (iocb->ki_flags & IOCB_NOWAIT) {
		/* unaligned dio always waits, bail */
		if (unaligned_io)
			return -EAGAIN;
		if (!xfs_ilock_nowait(ip, iolock))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, iolock);
	}

	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;
	count = iov_iter_count(from);

	/*
	 * If we are doing unaligned IO, we can't allow any other overlapping IO
	 * in-flight at the same time or we risk data corruption. Wait for all
	 * other IO to drain before we submit. If the IO is aligned, demote the
	 * iolock if we had to take the exclusive lock in
	 * xfs_file_aio_write_checks() for other reasons.
	 */
	if (unaligned_io) {
		inode_dio_wait(inode);
	} else if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
	/*
	 * If unaligned, this is the only IO in-flight. Wait on it before we
	 * release the iolock to prevent subsequent overlapping IO.
	 */
	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			   &xfs_dio_write_ops,
			   is_sync_kiocb(iocb) || unaligned_io);
out:
	xfs_iunlock(ip, iolock);

	/*
	 * No fallback to buffered IO on errors for XFS, direct IO will either
	 * complete fully or fail.
	 */
	ASSERT(ret < 0 || ret == count);
	return ret;
}

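/* DAX writes are currently always serialised under the exclusive iolock. */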
static noinline ssize_t
xfs_file_dax_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	int			iolock = XFS_IOLOCK_EXCL;
	ssize_t			ret, error = 0;
	size_t			count;
	loff_t			pos;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, iolock))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, iolock);
	}

	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	pos = iocb->ki_pos;
	count = iov_iter_count(from);

	trace_xfs_file_dax_write(ip, count, pos);
	ret = dax_iomap_rw(iocb, from, &xfs_direct_write_iomap_ops);
	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
		i_size_write(inode, iocb->ki_pos);
		error = xfs_setfilesize(ip, pos, ret);
	}
out:
	xfs_iunlock(ip, iolock);
	if (error)
		return error;

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

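/*
 * Buffered writes take the exclusive iolock.  On EDQUOT/ENOSPC we try to free
 * lingering preallocated and COW blocks and then retry the write.
 */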
STATIC ssize_t
xfs_file_buffered_aio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	int			enospc = 0;
	int			iolock;

	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EOPNOTSUPP;

write_retry:
	iolock = XFS_IOLOCK_EXCL;
	xfs_ilock(ip, iolock);

	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);

	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops);
	if (likely(ret >= 0))
		iocb->ki_pos += ret;

	/*
	 * If we hit a space limit, try to free up some lingering preallocated
	 * space before returning an error. In the case of ENOSPC, first try to
	 * write back all dirty inodes to free up some of the excess reserved
	 * metadata space. This reduces the chances that the eofblocks scan
	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
	 * also behaves as a filter to prevent too many eofblocks scans from
	 * running at the same time.
	 */
	if (ret == -EDQUOT && !enospc) {
		xfs_iunlock(ip, iolock);
		enospc = xfs_inode_free_quota_eofblocks(ip);
		if (enospc)
			goto write_retry;
		enospc = xfs_inode_free_quota_cowblocks(ip);
		if (enospc)
			goto write_retry;
		iolock = 0;
	} else if (ret == -ENOSPC && !enospc) {
		struct xfs_eofblocks eofb = {0};

		enospc = 1;
		xfs_flush_inodes(ip->i_mount);

		xfs_iunlock(ip, iolock);
		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
		xfs_icache_free_cowblocks(ip->i_mount, &eofb);
		goto write_retry;
	}

	current->backing_dev_info = NULL;
out:
	if (iolock)
		xfs_iunlock(ip, iolock);

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}

STATIC ssize_t
xfs_file_write_iter(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	size_t			ocount = iov_iter_count(from);

	XFS_STATS_INC(ip->i_mount, xs_write_calls);

	if (ocount == 0)
		return 0;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	if (IS_DAX(inode))
		return xfs_file_dax_write(iocb, from);

	if (iocb->ki_flags & IOCB_DIRECT) {
		/*
		 * Allow a directio write to fall back to a buffered
		 * write *only* in the case that we're doing a reflink
		 * CoW.  In all other directio scenarios we do not
		 * allow an operation to fall back to buffered mode.
		 */
		ret = xfs_file_dio_aio_write(iocb, from);
		if (ret != -EREMCHG)
			return ret;
	}

	return xfs_file_buffered_aio_write(iocb, from);
}

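/*
 * DAX pages can have elevated reference counts (e.g. from get_user_pages())
 * that must drop before a layout-changing operation such as truncate or hole
 * punch can proceed.  Drop MMAPLOCK_EXCL while we sleep so those references
 * can be released, then retake the lock and let the caller retry.
 */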
static void
xfs_wait_dax_page(
	struct inode		*inode)
{
	struct xfs_inode        *ip = XFS_I(inode);

	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
	schedule();
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
}

static int
xfs_break_dax_layouts(
	struct inode		*inode,
	bool			*retry)
{
	struct page		*page;

	ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));

	page = dax_layout_busy_page(inode->i_mapping);
	if (!page)
		return 0;

	*retry = true;
	return ___wait_var_event(&page->_refcount,
			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
			0, 0, xfs_wait_dax_page(inode));
}

int
xfs_break_layouts(
	struct inode		*inode,
	uint			*iolock,
	enum layout_break_reason reason)
{
	bool			retry;
	int			error;

	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));

	do {
		retry = false;
		switch (reason) {
		case BREAK_UNMAP:
			error = xfs_break_dax_layouts(inode, &retry);
			if (error || retry)
				break;
			/* fall through */
		case BREAK_WRITE:
			error = xfs_break_leased_layouts(inode, iolock, &retry);
			break;
		default:
			WARN_ON_ONCE(1);
			error = -EINVAL;
		}
	} while (error == 0 && retry);

	return error;
}

#define	XFS_FALLOC_FL_SUPPORTED						\
		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)

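/*
 * All fallocate modes are serialised against other I/O and page faults by
 * holding both the iolock and the mmap lock exclusively for the duration of
 * the operation.
 */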
STATIC long
xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	long			error;
	enum xfs_prealloc_flags	flags = 0;
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	loff_t			new_size = 0;
	bool			do_file_insert = false;

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
	if (error)
		goto out_unlock;

	/*
	 * Must wait for all AIO to complete before we continue as AIO can
	 * change the file size on completion without holding any locks we
	 * currently hold. We must do this first because AIO can update both
	 * the on disk and in memory inode sizes, and the operations that follow
	 * require the in-memory size to be fully up-to-date.
	 */
	inode_dio_wait(inode);

	/*
	 * Now that AIO and DIO have drained we flush and (if necessary)
	 * invalidate the cached range over the first operation we are about
	 * to run.
	 *
	 * We care about zero and collapse here because they both run a hole
	 * punch over the range first. Because that can zero data, and the range
	 * of invalidation for the shift operations is much larger, we still do
	 * the required flush for collapse in xfs_prepare_shift().
	 *
	 * Insert has the same range requirements as collapse, and we extend the
	 * file first which can zero data. Hence insert has the same
	 * flush/invalidate requirements as collapse and so they are both
	 * handled at the right time by xfs_prepare_shift().
	 */
	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
		    FALLOC_FL_COLLAPSE_RANGE)) {
		error = xfs_flush_unmap_range(ip, offset, len);
		if (error)
			goto out_unlock;
	}

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		error = xfs_free_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
		unsigned int blksize_mask = i_blocksize(inode) - 1;

		if (offset & blksize_mask || len & blksize_mask) {
			error = -EINVAL;
			goto out_unlock;
		}

		/*
		 * There is no need to overlap collapse range with EOF,
		 * in which case it is effectively a truncate operation
		 */
		if (offset + len >= i_size_read(inode)) {
			error = -EINVAL;
			goto out_unlock;
		}

		new_size = i_size_read(inode) - len;

		error = xfs_collapse_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	} else if (mode & FALLOC_FL_INSERT_RANGE) {
		unsigned int	blksize_mask = i_blocksize(inode) - 1;
		loff_t		isize = i_size_read(inode);

		if (offset & blksize_mask || len & blksize_mask) {
			error = -EINVAL;
			goto out_unlock;
		}

		/*
		 * New inode size must not exceed ->s_maxbytes, accounting for
		 * possible signed overflow.
		 */
		if (inode->i_sb->s_maxbytes - isize < len) {
			error = -EFBIG;
			goto out_unlock;
		}
		new_size = isize + len;

		/* Offset should be less than i_size */
		if (offset >= isize) {
			error = -EINVAL;
			goto out_unlock;
		}
		do_file_insert = true;
	} else {
		flags |= XFS_PREALLOC_SET;

		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    offset + len > i_size_read(inode)) {
			new_size = offset + len;
			error = inode_newsize_ok(inode, new_size);
			if (error)
				goto out_unlock;
		}

		if (mode & FALLOC_FL_ZERO_RANGE) {
			/*
			 * Punch a hole and prealloc the range.  We use a hole
			 * punch rather than unwritten extent conversion for two
			 * reasons:
			 *
			 *   1.) Hole punch handles partial block zeroing for us.
			 *   2.) If prealloc returns ENOSPC, the file range is
			 *       still zero-valued by virtue of the hole punch.
			 */
			unsigned int blksize = i_blocksize(inode);

			trace_xfs_zero_file_space(ip);

			error = xfs_free_file_space(ip, offset, len);
			if (error)
				goto out_unlock;

			len = round_up(offset + len, blksize) -
			      round_down(offset, blksize);
			offset = round_down(offset, blksize);
		} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
			error = xfs_reflink_unshare(ip, offset, len);
			if (error)
				goto out_unlock;
		} else {
			/*
			 * If always_cow mode we can't use preallocations and
			 * thus should not create them.
			 */
			if (xfs_is_always_cow_inode(ip)) {
				error = -EOPNOTSUPP;
				goto out_unlock;
			}
		}

		if (!xfs_is_always_cow_inode(ip)) {
			error = xfs_alloc_file_space(ip, offset, len,
						     XFS_BMAPI_PREALLOC);
			if (error)
				goto out_unlock;
		}
	}

	if (file->f_flags & O_DSYNC)
		flags |= XFS_PREALLOC_SYNC;

	error = xfs_update_prealloc_flags(ip, flags);
	if (error)
		goto out_unlock;

	/* Change file size if needed */
	if (new_size) {
		struct iattr iattr;

		iattr.ia_valid = ATTR_SIZE;
		iattr.ia_size = new_size;
		error = xfs_vn_setattr_size(file_dentry(file), &iattr);
		if (error)
			goto out_unlock;
	}

	/*
	 * Perform hole insertion now that the file size has been
	 * updated so that if we crash during the operation we don't
	 * leave shifted extents past EOF and hence lose access to
	 * the data that is contained within them.
	 */
	if (do_file_insert)
		error = xfs_insert_file_space(ip, offset, len);

out_unlock:
	xfs_iunlock(ip, iolock);
	return error;
}

STATIC int
xfs_file_fadvise(
	struct file	*file,
	loff_t		start,
	loff_t		end,
	int		advice)
{
	struct xfs_inode *ip = XFS_I(file_inode(file));
	int ret;
	int lockflags = 0;

	/*
	 * Operations creating pages in page cache need protection from hole
	 * punching and similar ops
	 */
	if (advice == POSIX_FADV_WILLNEED) {
		lockflags = XFS_IOLOCK_SHARED;
		xfs_ilock(ip, lockflags);
	}
	ret = generic_fadvise(file, start, end, advice);
	if (lockflags)
		xfs_iunlock(ip, lockflags);
	return ret;
}

STATIC loff_t
xfs_file_remap_range(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	struct xfs_mount	*mp = src->i_mount;
	loff_t			remapped = 0;
	xfs_extlen_t		cowextsize;
	int			ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (!xfs_sb_version_hasreflink(&mp->m_sb))
		return -EOPNOTSUPP;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	/* Prepare and then clone file data. */
	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
			&len, remap_flags);
	if (ret < 0 || len == 0)
		return ret;

	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
			&remapped);
	if (ret)
		goto out_unlock;

	/*
	 * Carry the cowextsize hint from src to dest if we're sharing the
	 * entire source file to the entire destination file, the source file
	 * has a cowextsize hint, and the destination file does not.
	 */
	cowextsize = 0;
	if (pos_in == 0 && len == i_size_read(inode_in) &&
	    (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
	    pos_out == 0 && len >= i_size_read(inode_out) &&
	    !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
		cowextsize = src->i_d.di_cowextsize;

	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
			remap_flags);
	if (ret)
		goto out_unlock;

	if (mp->m_flags & XFS_MOUNT_WSYNC)
		xfs_log_force_inode(dest);
out_unlock:
	xfs_reflink_remap_unlock(file_in, file_out);
	if (ret)
		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
	return remapped > 0 ? remapped : ret;
}

STATIC int
xfs_file_open(
	struct inode	*inode,
	struct file	*file)
{
	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
		return -EFBIG;
	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
		return -EIO;
	file->f_mode |= FMODE_NOWAIT;
	return 0;
}

STATIC int
xfs_dir_open(
	struct inode	*inode,
	struct file	*file)
{
	struct xfs_inode *ip = XFS_I(inode);
	int		mode;
	int		error;

	error = xfs_file_open(inode, file);
	if (error)
		return error;

	/*
	 * If there are any blocks, read-ahead block 0 as we're almost
	 * certain to have the next operation be a read there.
	 */
	mode = xfs_ilock_data_map_shared(ip);
	if (ip->i_df.if_nextents > 0)
		error = xfs_dir3_data_readahead(ip, 0, 0);
	xfs_iunlock(ip, mode);
	return error;
}

STATIC int
xfs_file_release(
	struct inode	*inode,
	struct file	*filp)
{
	return xfs_release(XFS_I(inode));
}

STATIC int
xfs_file_readdir(
	struct file	*file,
	struct dir_context *ctx)
{
	struct inode	*inode = file_inode(file);
	xfs_inode_t	*ip = XFS_I(inode);
	size_t		bufsize;

	/*
	 * The Linux API doesn't pass the total size of the buffer we read
	 * into down to the filesystem.  With the filldir concept it's not
	 * needed for correct information, but the XFS dir2 leaf code wants
	 * an estimate of the buffer size to calculate its readahead window
	 * and size the buffers used for mapping to physical blocks.
	 *
	 * Try to give it an estimate that's good enough, maybe at some
	 * point we can change the ->readdir prototype to include the
	 * buffer size.  For now we use the current glibc buffer size.
	 */
	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_d.di_size);

	return xfs_readdir(NULL, ip, ctx, bufsize);
}

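/*
 * SEEK_HOLE and SEEK_DATA are implemented on top of the iomap extent mapping
 * interface; all other whence values fall back to generic_file_llseek().
 */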
STATIC loff_t
xfs_file_llseek(
	struct file	*file,
	loff_t		offset,
	int		whence)
{
	struct inode		*inode = file->f_mapping->host;

	if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount))
		return -EIO;

	switch (whence) {
	default:
		return generic_file_llseek(file, offset, whence);
	case SEEK_HOLE:
		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
		break;
	case SEEK_DATA:
		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
		break;
	}

	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}

/*
 * Locking for serialisation of IO during page faults. This results in a lock
 * ordering of:
 *
 * mmap_sem (MM)
 *   sb_start_pagefault(vfs, freeze)
 *     i_mmaplock (XFS - truncate serialisation)
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
 */
static vm_fault_t
__xfs_filemap_fault(
	struct vm_fault		*vmf,
	enum page_entry_size	pe_size,
	bool			write_fault)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	vm_fault_t		ret;

	trace_xfs_filemap_fault(ip, pe_size, write_fault);

	if (write_fault) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vmf->vma->vm_file);
	}

	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	if (IS_DAX(inode)) {
		pfn_t pfn;

		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL,
				(write_fault && !vmf->cow_page) ?
				 &xfs_direct_write_iomap_ops :
				 &xfs_read_iomap_ops);
		if (ret & VM_FAULT_NEEDDSYNC)
			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
	} else {
		if (write_fault)
			ret = iomap_page_mkwrite(vmf,
					&xfs_buffered_write_iomap_ops);
		else
			ret = filemap_fault(vmf);
	}
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

	if (write_fault)
		sb_end_pagefault(inode->i_sb);
	return ret;
}

static vm_fault_t
xfs_filemap_fault(
	struct vm_fault		*vmf)
{
	/* DAX can shortcut the normal fault path on write faults! */
	return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
			IS_DAX(file_inode(vmf->vma->vm_file)) &&
			(vmf->flags & FAULT_FLAG_WRITE));
}

static vm_fault_t
xfs_filemap_huge_fault(
	struct vm_fault		*vmf,
	enum page_entry_size	pe_size)
{
	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
		return VM_FAULT_FALLBACK;

	/* DAX can shortcut the normal fault path on write faults! */
	return __xfs_filemap_fault(vmf, pe_size,
			(vmf->flags & FAULT_FLAG_WRITE));
}

static vm_fault_t
xfs_filemap_page_mkwrite(
	struct vm_fault		*vmf)
{
	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}

/*
 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
 * on write faults. In reality, it needs to serialise against truncate and
 * prepare memory for writing, so handle it as a standard write fault.
 */
static vm_fault_t
xfs_filemap_pfn_mkwrite(
	struct vm_fault		*vmf)
{

	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}

static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= xfs_filemap_fault,
	.huge_fault	= xfs_filemap_huge_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
};

STATIC int
xfs_file_mmap(
	struct file		*file,
	struct vm_area_struct	*vma)
{
	struct inode		*inode = file_inode(file);
	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));

	/*
	 * We don't support synchronous mappings for non-DAX files and
	 * for DAX files if the underlying dax_device is not synchronous.
	 */
	if (!daxdev_mapping_supported(vma, target->bt_daxdev))
		return -EOPNOTSUPP;

	file_accessed(file);
	vma->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(inode))
		vma->vm_flags |= VM_HUGEPAGE;
	return 0;
}

const struct file_operations xfs_file_operations = {
	.llseek		= xfs_file_llseek,
	.read_iter	= xfs_file_read_iter,
	.write_iter	= xfs_file_write_iter,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iomap_dio_iopoll,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap		= xfs_file_mmap,
	.mmap_supported_flags = MAP_SYNC,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
	.get_unmapped_area = thp_get_unmapped_area,
	.fallocate	= xfs_file_fallocate,
	.fadvise	= xfs_file_fadvise,
	.remap_file_range = xfs_file_remap_range,
};

const struct file_operations xfs_dir_file_operations = {
	.open		= xfs_dir_open,
	.read		= generic_read_dir,
	.iterate_shared	= xfs_file_readdir,
	.llseek		= generic_file_llseek,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.fsync		= xfs_dir_fsync,
};