xfs_file.c 29.9 KB
Newer Older
D
Dave Chinner 已提交
1
// SPDX-License-Identifier: GPL-2.0
L
Linus Torvalds 已提交
2
/*
3 4
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
L
Linus Torvalds 已提交
5 6
 */
#include "xfs.h"
7
#include "xfs_fs.h"
8
#include "xfs_shared.h"
9
#include "xfs_format.h"
10 11
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
L
Linus Torvalds 已提交
12
#include "xfs_mount.h"
13 14
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
L
Linus Torvalds 已提交
15
#include "xfs_inode.h"
16
#include "xfs_trans.h"
17
#include "xfs_inode_item.h"
18
#include "xfs_bmap.h"
D
Dave Chinner 已提交
19
#include "xfs_bmap_util.h"
L
Linus Torvalds 已提交
20
#include "xfs_error.h"
21
#include "xfs_dir2.h"
D
Dave Chinner 已提交
22
#include "xfs_dir2_priv.h"
23
#include "xfs_ioctl.h"
24
#include "xfs_trace.h"
25
#include "xfs_log.h"
26
#include "xfs_icache.h"
27
#include "xfs_pnfs.h"
28
#include "xfs_iomap.h"
29
#include "xfs_reflink.h"
L
Linus Torvalds 已提交
30 31

#include <linux/dcache.h>
32
#include <linux/falloc.h>
33
#include <linux/pagevec.h>
34
#include <linux/backing-dev.h>
35
#include <linux/mman.h>
L
Linus Torvalds 已提交
36

37
static const struct vm_operations_struct xfs_file_vm_ops;
L
Linus Torvalds 已提交
38

39 40 41 42 43 44 45 46
int
xfs_update_prealloc_flags(
	struct xfs_inode	*ip,
	enum xfs_prealloc_flags	flags)
{
	struct xfs_trans	*tp;
	int			error;

47 48 49
	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_writeid,
			0, 0, 0, &tp);
	if (error)
50 51 52 53 54 55
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	if (!(flags & XFS_PREALLOC_INVISIBLE)) {
D
Dave Chinner 已提交
56 57 58
		VFS_I(ip)->i_mode &= ~S_ISUID;
		if (VFS_I(ip)->i_mode & S_IXGRP)
			VFS_I(ip)->i_mode &= ~S_ISGID;
59 60 61 62 63 64 65 66 67 68 69
		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	}

	if (flags & XFS_PREALLOC_SET)
		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
	if (flags & XFS_PREALLOC_CLEAR)
		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	if (flags & XFS_PREALLOC_SYNC)
		xfs_trans_set_sync(tp);
70
	return xfs_trans_commit(tp);
71 72
}

73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
/*
 * Directory fsync.  Much simpler than the regular file case: there is no
 * file data to write back, so no explicit cache flushes are needed, and
 * directories have no non-transactional metadata updates.  All that is left
 * is forcing the log up to the last LSN recorded for a pinned inode.
 */
STATIC int
xfs_dir_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_lsn_t		force_lsn = 0;

	trace_xfs_dir_fsync(ip);

	/* Sample the last LSN under the shared ilock, only if pinned. */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_ipincount(ip))
		force_lsn = ip->i_itemp->ili_last_lsn;
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	if (force_lsn)
		return xfs_log_force_lsn(mp, force_lsn, XFS_LOG_SYNC, NULL);
	return 0;
}

102 103 104
/*
 * fsync/fdatasync for regular files.
 *
 * Writes back and waits on the page cache range [start, end], then forces
 * the log up to the inode's last LSN when the inode is pinned and either a
 * full fsync was requested or fields other than timestamps are dirty.
 * Device cache flushes are issued around the log force as described in the
 * comments below.
 *
 * Returns the writeback error, -EIO on a shut down filesystem, or the
 * result of the log force (0 when no log force was needed).
 */
STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_lsn_t		force_lsn = 0;
	int			flushed = 0;
	int			error;

	trace_xfs_file_fsync(ip);

	error = file_write_and_wait_range(file, start, end);
	if (error)
		return error;

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	xfs_iflags_clear(ip, XFS_ITRUNCATED);

	/*
	 * If we have an RT and/or log subvolume we need to make sure to flush
	 * the write cache of the device used for file data first.  This is to
	 * ensure newly written file data make it to disk before logging the
	 * new inode size in case of an extending write.
	 */
	if (XFS_IS_REALTIME_INODE(ip))
		xfs_blkdev_issue_flush(mp->m_rtdev_targp);
	else if (mp->m_logdev_targp != mp->m_ddev_targp)
		xfs_blkdev_issue_flush(mp->m_ddev_targp);

	/*
	 * All metadata updates are logged, which means that we just have to
	 * flush the log up to the latest LSN that touched the inode. If we have
	 * concurrent fsync/fdatasync() calls, we need them to all block on the
	 * log force before we clear the ili_fsync_fields field. This ensures
	 * that we don't get a racing sync operation that does not wait for the
	 * metadata to hit the journal before returning. If we race with
	 * clearing the ili_fsync_fields, then all that will happen is the log
	 * force will do nothing as the lsn will already be on disk. We can't
	 * race with setting ili_fsync_fields because that is done under
	 * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
	 * until after the ili_fsync_fields is cleared.
	 */
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_ipincount(ip) &&
	    (!datasync ||
	     (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)))
		force_lsn = ip->i_itemp->ili_last_lsn;

	if (force_lsn) {
		error = xfs_log_force_lsn(mp, force_lsn, XFS_LOG_SYNC,
				&flushed);
		ip->i_itemp->ili_fsync_fields = 0;
	}
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	/*
	 * If we only have a single device, and the log force above was
	 * a no-op we might have to flush the data device cache here.
	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
	 * an already allocated file and thus do not have any metadata to
	 * commit.
	 */
	if (!flushed && !XFS_IS_REALTIME_INODE(ip) &&
	    mp->m_logdev_targp == mp->m_ddev_targp)
		xfs_blkdev_issue_flush(mp->m_ddev_targp);

	return error;
}

178
STATIC ssize_t
179
xfs_file_dio_aio_read(
180
	struct kiocb		*iocb,
A
Al Viro 已提交
181
	struct iov_iter		*to)
182
{
C
Christoph Hellwig 已提交
183
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
184
	size_t			count = iov_iter_count(to);
C
Christoph Hellwig 已提交
185
	ssize_t			ret;
186

187
	trace_xfs_file_direct_read(ip, count, iocb->ki_pos);
188

189 190
	if (!count)
		return 0; /* skip atime */
191

192 193
	file_accessed(iocb->ki_filp);

194
	xfs_ilock(ip, XFS_IOLOCK_SHARED);
C
Christoph Hellwig 已提交
195
	ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
196
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
C
Christoph Hellwig 已提交
197

198 199 200
	return ret;
}

201
static noinline ssize_t
202 203 204 205
xfs_file_dax_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
206
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
207 208 209 210 211 212 213 214
	size_t			count = iov_iter_count(to);
	ssize_t			ret = 0;

	trace_xfs_file_dax_read(ip, count, iocb->ki_pos);

	if (!count)
		return 0; /* skip atime */

C
Christoph Hellwig 已提交
215 216
	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
G
Goldwyn Rodrigues 已提交
217
			return -EAGAIN;
C
Christoph Hellwig 已提交
218
	} else {
G
Goldwyn Rodrigues 已提交
219 220
		xfs_ilock(ip, XFS_IOLOCK_SHARED);
	}
C
Christoph Hellwig 已提交
221

222
	ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops);
223
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
224

225
	file_accessed(iocb->ki_filp);
226 227 228 229 230 231 232 233 234 235 236 237
	return ret;
}

/*
 * Buffered read: run generic_file_read_iter() under the shared iolock.
 * IOCB_NOWAIT callers get -EAGAIN instead of sleeping on the lock.
 */
STATIC ssize_t
xfs_file_buffered_aio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			nread;

	trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos);

	if (!(iocb->ki_flags & IOCB_NOWAIT))
		xfs_ilock(ip, XFS_IOLOCK_SHARED);
	else if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
		return -EAGAIN;

	nread = generic_file_read_iter(iocb, to);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return nread;
}

/*
 * Read entry point: count the call, fail with -EIO on a shut down
 * filesystem, then dispatch to the DAX, direct or buffered read path.
 * Bytes actually read are added to the xs_read_bytes statistic.
 */
STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
	ssize_t			nread;

	XFS_STATS_INC(mp, xs_read_calls);

	if (XFS_FORCED_SHUTDOWN(mp))
		return -EIO;

	/* DAX takes precedence over IOCB_DIRECT. */
	if (IS_DAX(inode)) {
		nread = xfs_file_dax_read(iocb, to);
	} else if (iocb->ki_flags & IOCB_DIRECT) {
		nread = xfs_file_dio_aio_read(iocb, to);
	} else {
		nread = xfs_file_buffered_aio_read(iocb, to);
	}

	if (nread > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, nread);
	return nread;
}

277 278 279
/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held.  Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size, or when privileges may have
 * to be removed.
 *
 * Returns 0 on success, or the zero/negative result of
 * generic_write_checks() or a later failing step.
 */
STATIC ssize_t
xfs_file_aio_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	int			*iolock)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	size_t			orig_count = iov_iter_count(from);
	bool			dio_drained = false;
	bool			past_eof;
	loff_t			eof;
	ssize_t			ret = 0;

restart:
	ret = generic_write_checks(iocb, from);
	if (ret <= 0)
		return ret;

	ret = xfs_break_layouts(inode, iolock, BREAK_WRITE);
	if (ret)
		return ret;

	/*
	 * For changing security info in file_remove_privs() we need i_rwsem
	 * exclusively.
	 */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_iunlock(ip, *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		xfs_ilock(ip, *iolock);
		goto restart;
	}

	/*
	 * If the offset is beyond the size of the file, we need to zero any
	 * blocks that fall between the existing EOF and the start of this
	 * write.  If zeroing is needed and we are currently holding the
	 * iolock shared, we need to update it to exclusive which implies
	 * having to redo all checks before.
	 *
	 * We need to serialise against EOF updates that occur in IO
	 * completions here. We want to make sure that nobody is changing the
	 * size while we do this check until we have placed an IO barrier (i.e.
	 * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched.
	 * The spinlock effectively forms a memory barrier once we have the
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value
	 * and hence be able to correctly determine if we need to run zeroing.
	 */
	spin_lock(&ip->i_flags_lock);
	eof = i_size_read(inode);
	past_eof = iocb->ki_pos > eof;
	spin_unlock(&ip->i_flags_lock);

	if (past_eof && !dio_drained) {
		if (*iolock == XFS_IOLOCK_SHARED) {
			xfs_iunlock(ip, *iolock);
			*iolock = XFS_IOLOCK_EXCL;
			xfs_ilock(ip, *iolock);
			iov_iter_reexpand(from, orig_count);
		}
		/*
		 * We now have an IO submission barrier in place, but
		 * AIO can do EOF updates during IO completion and hence
		 * we now need to wait for all of them to drain. Non-AIO
		 * DIO will have drained before we are given the
		 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
		 * no-op.
		 */
		inode_dio_wait(inode);
		dio_drained = true;
		goto restart;
	}

	if (past_eof) {
		/* Zero the gap between the old EOF and the write offset. */
		trace_xfs_zero_eof(ip, eof, iocb->ki_pos - eof);
		ret = iomap_zero_range(inode, eof, iocb->ki_pos - eof,
				NULL, &xfs_iomap_ops);
		if (ret)
			return ret;
	}

	/*
	 * Updating the timestamps will grab the ilock again from
	 * xfs_fs_dirty_inode, so we have to call it after dropping the
	 * lock above.  Eventually we should look into a way to avoid
	 * the pointless lock roundtrip.
	 */
	if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
		ret = file_update_time(file);
		if (ret)
			return ret;
	}

	/*
	 * If we're writing the file then make sure to clear the setuid and
	 * setgid bits if the process is not being run by root.  This keeps
	 * people from modifying setuid and setgid binaries.
	 */
	if (!IS_NOSEC(inode))
		return file_remove_privs(file);
	return 0;
}

C
Christoph Hellwig 已提交
386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404
/*
 * Direct IO write completion handler passed to iomap_dio_rw().
 *
 * Accounts the bytes written, finishes COW remapping and unwritten extent
 * conversion when flagged, and otherwise extends the in-core and on-disk
 * file size if the write went past the current EOF.
 *
 * Returns -EIO on a shut down filesystem, @size unchanged when it is not
 * positive, or the result of the last completion step.
 */
static int
xfs_dio_write_end_io(
	struct kiocb		*iocb,
	ssize_t			size,
	unsigned		flags)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	loff_t			offset = iocb->ki_pos;
	int			error;

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;

	if (size <= 0)
		return size;

	/*
	 * Capture amount written on completion as we can't reliably account
	 * for it on submission.
	 */
	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);

	if (flags & IOMAP_DIO_COW) {
		error = xfs_reflink_end_cow(ip, offset, size);
		if (error)
			return error;
	}

	/*
	 * Unwritten conversion updates the in-core isize after extent
	 * conversion but before updating the on-disk size. Updating isize any
	 * earlier allows a racing dio read to find unwritten extents before
	 * they are converted.
	 */
	if (flags & IOMAP_DIO_UNWRITTEN)
		return xfs_iomap_write_unwritten(ip, offset, size, true);

	/*
	 * We need to update the in-core inode size here so that we don't end up
	 * with the on-disk inode size being outside the in-core inode size. We
	 * have no other method of updating EOF for AIO, so always do it here
	 * if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF. Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 */
	spin_lock(&ip->i_flags_lock);
	if (offset + size <= i_size_read(inode)) {
		spin_unlock(&ip->i_flags_lock);
		return 0;
	}
	i_size_write(inode, offset + size);
	spin_unlock(&ip->i_flags_lock);

	return xfs_setfilesize(ip, offset, size);
}

449 450 451 452
/*
 * xfs_file_dio_aio_write - handle direct IO writes
 *
 * Lock the inode appropriately to prepare for and issue a direct IO write.
 * By separating it from the buffered write path we remove all the tricky to
 * follow locking changes and looping.
 *
 * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
 * until we're sure the bytes at the new EOF have been zeroed and/or the cached
 * pages are flushed out.
 *
 * In most cases the direct IO writes will be done holding IOLOCK_SHARED
 * allowing them to be done in parallel with reads and other direct IO writes.
 * However, if the IO is not aligned to filesystem blocks, the direct IO layer
 * needs to do sub-block zeroing and that requires serialisation against other
 * direct IOs to the same block. In this case we need to serialise the
 * submission of the unaligned IOs so that we don't get racing block zeroing in
 * the dio layer.  To avoid the problem with aio, we also need to wait for
 * outstanding IOs to complete so that unwritten extent conversion is completed
 * before we try to map the overlapping block. This is currently implemented by
 * hitting it with a big hammer (i.e. inode_dio_wait()).
 *
 * The iolock is always dropped again before returning, on success and on
 * every error path taken after it was acquired.
 *
 * Returns the number of bytes written or a negative errno:
 *   -EINVAL   position or length not aligned to the logical sector size
 *   -EREMCHG  block-unaligned write to a reflink inode (the caller falls
 *             back to a buffered write)
 *   -EAGAIN   IOCB_NOWAIT was set and the write would have had to block
 */
STATIC ssize_t
xfs_file_dio_aio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			ret = 0;
	int			unaligned_io = 0;
	int			iolock;
	size_t			count = iov_iter_count(from);
	struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
					mp->m_rtdev_targp : mp->m_ddev_targp;

	/* DIO must be aligned to device logical sector size */
	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
		return -EINVAL;

	/*
	 * Don't take the exclusive iolock here unless the I/O is unaligned to
	 * the file system block size.  We don't need to consider the EOF
	 * extension case here because xfs_file_aio_write_checks() will relock
	 * the inode as necessary for EOF zeroing cases and fill out the new
	 * inode size as appropriate.
	 */
	if ((iocb->ki_pos & mp->m_blockmask) ||
	    ((iocb->ki_pos + count) & mp->m_blockmask)) {
		unaligned_io = 1;

		/*
		 * We can't properly handle unaligned direct I/O to reflink
		 * files yet, as we can't unshare a partial block.
		 */
		if (xfs_is_reflink_inode(ip)) {
			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
			return -EREMCHG;
		}
		iolock = XFS_IOLOCK_EXCL;
	} else {
		iolock = XFS_IOLOCK_SHARED;
	}

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, iolock))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, iolock);
	}

	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;
	count = iov_iter_count(from);

	/*
	 * If we are doing unaligned IO, wait for all other IO to drain,
	 * otherwise demote the lock if we had to take the exclusive lock
	 * for other reasons in xfs_file_aio_write_checks.
	 */
	if (unaligned_io) {
		/* If we are going to wait for other DIO to finish, bail */
		if (iocb->ki_flags & IOCB_NOWAIT) {
			if (atomic_read(&inode->i_dio_count)) {
				/*
				 * The iolock is held at this point, so leave
				 * through the common unlock path rather than
				 * returning directly and leaking the lock.
				 */
				ret = -EAGAIN;
				goto out;
			}
		} else {
			inode_dio_wait(inode);
		}
	} else if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}

	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
	ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io);
out:
	xfs_iunlock(ip, iolock);

	/*
	 * No fallback to buffered IO on errors for XFS, direct IO will either
	 * complete fully or fail.
	 */
	ASSERT(ret < 0 || ret == count);
	return ret;
}

562
static noinline ssize_t
563 564 565 566
xfs_file_dax_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
567
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
568
	struct xfs_inode	*ip = XFS_I(inode);
569
	int			iolock = XFS_IOLOCK_EXCL;
570 571 572
	ssize_t			ret, error = 0;
	size_t			count;
	loff_t			pos;
573

C
Christoph Hellwig 已提交
574 575
	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, iolock))
G
Goldwyn Rodrigues 已提交
576
			return -EAGAIN;
C
Christoph Hellwig 已提交
577
	} else {
G
Goldwyn Rodrigues 已提交
578 579 580
		xfs_ilock(ip, iolock);
	}

581 582 583 584
	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

585 586
	pos = iocb->ki_pos;
	count = iov_iter_count(from);
587

588
	trace_xfs_file_dax_write(ip, count, pos);
589
	ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops);
590 591 592
	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
		i_size_write(inode, iocb->ki_pos);
		error = xfs_setfilesize(ip, pos, ret);
593 594
	}
out:
595
	xfs_iunlock(ip, iolock);
596 597 598 599 600 601 602 603 604 605
	if (error)
		return error;

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
606 607
}

608
STATIC ssize_t
609
xfs_file_buffered_aio_write(
610
	struct kiocb		*iocb,
611
	struct iov_iter		*from)
612 613 614 615
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
616
	struct xfs_inode	*ip = XFS_I(inode);
617 618
	ssize_t			ret;
	int			enospc = 0;
619
	int			iolock;
620

621 622 623
	if (iocb->ki_flags & IOCB_NOWAIT)
		return -EOPNOTSUPP;

624 625
write_retry:
	iolock = XFS_IOLOCK_EXCL;
626
	xfs_ilock(ip, iolock);
627

628
	ret = xfs_file_aio_write_checks(iocb, from, &iolock);
629
	if (ret)
630
		goto out;
631 632

	/* We can write back this queue in page reclaim */
633
	current->backing_dev_info = inode_to_bdi(inode);
634

C
Christoph Hellwig 已提交
635
	trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos);
636
	ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops);
637
	if (likely(ret >= 0))
638
		iocb->ki_pos += ret;
639

640
	/*
641 642 643 644 645 646 647
	 * If we hit a space limit, try to free up some lingering preallocated
	 * space before returning an error. In the case of ENOSPC, first try to
	 * write back all dirty inodes to free up some of the excess reserved
	 * metadata space. This reduces the chances that the eofblocks scan
	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
	 * also behaves as a filter to prevent too many eofblocks scans from
	 * running at the same time.
648
	 */
649
	if (ret == -EDQUOT && !enospc) {
650
		xfs_iunlock(ip, iolock);
651 652 653
		enospc = xfs_inode_free_quota_eofblocks(ip);
		if (enospc)
			goto write_retry;
654 655 656
		enospc = xfs_inode_free_quota_cowblocks(ip);
		if (enospc)
			goto write_retry;
657
		iolock = 0;
658 659 660
	} else if (ret == -ENOSPC && !enospc) {
		struct xfs_eofblocks eofb = {0};

661
		enospc = 1;
D
Dave Chinner 已提交
662
		xfs_flush_inodes(ip->i_mount);
663 664

		xfs_iunlock(ip, iolock);
665 666
		eofb.eof_flags = XFS_EOF_FLAGS_SYNC;
		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
667
		xfs_icache_free_cowblocks(ip->i_mount, &eofb);
D
Dave Chinner 已提交
668
		goto write_retry;
669
	}
670

671
	current->backing_dev_info = NULL;
672
out:
673 674
	if (iolock)
		xfs_iunlock(ip, iolock);
675 676 677 678 679 680

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
681 682 683 684
	return ret;
}

STATIC ssize_t
A
Al Viro 已提交
685
xfs_file_write_iter(
686
	struct kiocb		*iocb,
A
Al Viro 已提交
687
	struct iov_iter		*from)
688 689 690 691 692 693
{
	struct file		*file = iocb->ki_filp;
	struct address_space	*mapping = file->f_mapping;
	struct inode		*inode = mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
A
Al Viro 已提交
694
	size_t			ocount = iov_iter_count(from);
695

696
	XFS_STATS_INC(ip->i_mount, xs_write_calls);
697 698 699 700

	if (ocount == 0)
		return 0;

A
Al Viro 已提交
701 702
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EIO;
703

704
	if (IS_DAX(inode))
705 706 707
		return xfs_file_dax_write(iocb, from);

	if (iocb->ki_flags & IOCB_DIRECT) {
708 709 710 711 712 713
		/*
		 * Allow a directio write to fall back to a buffered
		 * write *only* in the case that we're doing a reflink
		 * CoW.  In all other directio scenarios we do not
		 * allow an operation to fall back to buffered mode.
		 */
A
Al Viro 已提交
714
		ret = xfs_file_dio_aio_write(iocb, from);
715 716
		if (ret != -EREMCHG)
			return ret;
717
	}
718

719
	return xfs_file_buffered_aio_write(iocb, from);
720 721
}

722 723
static void
xfs_wait_dax_page(
724
	struct inode		*inode)
725 726 727 728 729 730 731 732 733 734 735
{
	struct xfs_inode        *ip = XFS_I(inode);

	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
	schedule();
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
}

static int
xfs_break_dax_layouts(
	struct inode		*inode,
736
	bool			*retry)
737 738 739 740 741 742 743 744 745
{
	struct page		*page;

	ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));

	page = dax_layout_busy_page(inode->i_mapping);
	if (!page)
		return 0;

746
	*retry = true;
747 748
	return ___wait_var_event(&page->_refcount,
			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
749
			0, 0, xfs_wait_dax_page(inode));
750 751
}

752 753 754 755 756 757 758
/*
 * Break layouts on an inode before an operation identified by @reason.
 *
 * BREAK_UNMAP first breaks DAX layouts and, if that neither failed nor
 * asked for a retry, goes on to break leased layouts.  BREAK_WRITE only
 * breaks leased layouts.  Any other reason warns once and fails with
 * -EINVAL.  The whole sequence repeats while a step requests a retry
 * without error.
 *
 * The caller must hold the iolock, shared or exclusive (asserted).
 */
int
xfs_break_layouts(
	struct inode		*inode,
	uint			*iolock,
	enum layout_break_reason reason)
{
	bool			again;
	int			error;

	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));

	do {
		again = false;

		if (reason == BREAK_UNMAP) {
			error = xfs_break_dax_layouts(inode, &again);
			if (!error && !again)
				error = xfs_break_leased_layouts(inode, iolock,
						&again);
		} else if (reason == BREAK_WRITE) {
			error = xfs_break_leased_layouts(inode, iolock, &again);
		} else {
			WARN_ON_ONCE(1);
			error = -EINVAL;
		}
	} while (error == 0 && again);

	return error;
}

783 784 785
#define	XFS_FALLOC_FL_SUPPORTED						\
		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
786
		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
787

788 789
STATIC long
xfs_file_fallocate(
790 791 792 793
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
794
{
795 796 797
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	long			error;
798
	enum xfs_prealloc_flags	flags = 0;
799
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
800
	loff_t			new_size = 0;
801
	bool			do_file_insert = false;
802

803 804
	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
805
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
806 807
		return -EOPNOTSUPP;

808
	xfs_ilock(ip, iolock);
809
	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
810 811 812
	if (error)
		goto out_unlock;

813 814 815 816
	if (mode & FALLOC_FL_PUNCH_HOLE) {
		error = xfs_free_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
817
	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
F
Fabian Frederick 已提交
818
		unsigned int blksize_mask = i_blocksize(inode) - 1;
819 820

		if (offset & blksize_mask || len & blksize_mask) {
D
Dave Chinner 已提交
821
			error = -EINVAL;
822 823 824
			goto out_unlock;
		}

825 826 827 828 829
		/*
		 * There is no need to overlap collapse range with EOF,
		 * in which case it is effectively a truncate operation
		 */
		if (offset + len >= i_size_read(inode)) {
D
Dave Chinner 已提交
830
			error = -EINVAL;
831 832 833
			goto out_unlock;
		}

834 835 836 837 838
		new_size = i_size_read(inode) - len;

		error = xfs_collapse_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
839
	} else if (mode & FALLOC_FL_INSERT_RANGE) {
840 841
		unsigned int	blksize_mask = i_blocksize(inode) - 1;
		loff_t		isize = i_size_read(inode);
842 843 844 845 846 847

		if (offset & blksize_mask || len & blksize_mask) {
			error = -EINVAL;
			goto out_unlock;
		}

848 849 850 851 852
		/*
		 * New inode size must not exceed ->s_maxbytes, accounting for
		 * possible signed overflow.
		 */
		if (inode->i_sb->s_maxbytes - isize < len) {
853 854 855
			error = -EFBIG;
			goto out_unlock;
		}
856
		new_size = isize + len;
857 858

		/* Offset should be less than i_size */
859
		if (offset >= isize) {
860 861 862
			error = -EINVAL;
			goto out_unlock;
		}
863
		do_file_insert = true;
864
	} else {
865 866
		flags |= XFS_PREALLOC_SET;

867 868 869
		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    offset + len > i_size_read(inode)) {
			new_size = offset + len;
D
Dave Chinner 已提交
870
			error = inode_newsize_ok(inode, new_size);
871 872 873
			if (error)
				goto out_unlock;
		}
874

875 876
		if (mode & FALLOC_FL_ZERO_RANGE)
			error = xfs_zero_file_space(ip, offset, len);
877 878 879 880 881 882
		else {
			if (mode & FALLOC_FL_UNSHARE_RANGE) {
				error = xfs_reflink_unshare(ip, offset, len);
				if (error)
					goto out_unlock;
			}
883 884
			error = xfs_alloc_file_space(ip, offset, len,
						     XFS_BMAPI_PREALLOC);
885
		}
886 887 888 889
		if (error)
			goto out_unlock;
	}

890
	if (file->f_flags & O_DSYNC)
891 892 893
		flags |= XFS_PREALLOC_SYNC;

	error = xfs_update_prealloc_flags(ip, flags);
894 895 896 897 898 899 900 901 902
	if (error)
		goto out_unlock;

	/* Change file size if needed */
	if (new_size) {
		struct iattr iattr;

		iattr.ia_valid = ATTR_SIZE;
		iattr.ia_size = new_size;
903
		error = xfs_vn_setattr_size(file_dentry(file), &iattr);
904 905
		if (error)
			goto out_unlock;
906 907
	}

908 909 910 911 912 913 914 915 916
	/*
	 * Perform hole insertion now that the file size has been
	 * updated so that if we crash during the operation we don't
	 * leave shifted extents past EOF and hence losing access to
	 * the data that is contained within them.
	 */
	if (do_file_insert)
		error = xfs_insert_file_space(ip, offset, len);

917
out_unlock:
918
	xfs_iunlock(ip, iolock);
D
Dave Chinner 已提交
919
	return error;
920 921
}

922
STATIC loff_t
923
xfs_file_remap_range(
924 925 926 927
	struct file	*file_in,
	loff_t		pos_in,
	struct file	*file_out,
	loff_t		pos_out,
928
	loff_t		len,
929
	unsigned int	remap_flags)
930
{
931 932
	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;
933

934
	return xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out,
935
			len, remap_flags);
936
}
937

L
Linus Torvalds 已提交
938
STATIC int
939
xfs_file_open(
L
Linus Torvalds 已提交
940
	struct inode	*inode,
941
	struct file	*file)
L
Linus Torvalds 已提交
942
{
943
	if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
L
Linus Torvalds 已提交
944
		return -EFBIG;
945 946
	if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
		return -EIO;
947
	file->f_mode |= FMODE_NOWAIT;
948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967
	return 0;
}

STATIC int
xfs_dir_open(
	struct inode	*inode,
	struct file	*file)
{
	struct xfs_inode *ip = XFS_I(inode);
	int		mode;
	int		error;

	error = xfs_file_open(inode, file);
	if (error)
		return error;

	/*
	 * If there are any blocks, read-ahead block 0 as we're almost
	 * certain to have the next operation be a read there.
	 */
968
	mode = xfs_ilock_data_map_shared(ip);
969
	if (ip->i_d.di_nextents > 0)
970
		error = xfs_dir3_data_readahead(ip, 0, -1);
971
	xfs_iunlock(ip, mode);
972
	return error;
L
Linus Torvalds 已提交
973 974 975
}

/*
 * Release callback: hand the XFS inode to xfs_release() and return its
 * result.  @filp is unused.
 */
STATIC int
xfs_file_release(
	struct inode	*inode,
	struct file	*filp)
{
	struct xfs_inode *ip = XFS_I(inode);

	return xfs_release(ip);
}

STATIC int
984
xfs_file_readdir(
A
Al Viro 已提交
985 986
	struct file	*file,
	struct dir_context *ctx)
L
Linus Torvalds 已提交
987
{
A
Al Viro 已提交
988
	struct inode	*inode = file_inode(file);
989
	xfs_inode_t	*ip = XFS_I(inode);
C
Christoph Hellwig 已提交
990 991 992 993 994 995 996 997 998 999 1000 1001
	size_t		bufsize;

	/*
	 * The Linux API doesn't pass down the total size of the buffer
	 * we read into down to the filesystem.  With the filldir concept
	 * it's not needed for correct information, but the XFS dir2 leaf
	 * code wants an estimate of the buffer size to calculate it's
	 * readahead window and size the buffers used for mapping to
	 * physical blocks.
	 *
	 * Try to give it an estimate that's good enough, maybe at some
	 * point we can change the ->readdir prototype to include the
E
Eric Sandeen 已提交
1002
	 * buffer size.  For now we use the current glibc buffer size.
C
Christoph Hellwig 已提交
1003
	 */
D
Darrick J. Wong 已提交
1004
	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_d.di_size);
C
Christoph Hellwig 已提交
1005

1006
	return xfs_readdir(NULL, ip, ctx, bufsize);
1007 1008 1009 1010 1011 1012
}

/*
 * llseek.  SEEK_HOLE and SEEK_DATA are resolved through iomap and the
 * result is installed with vfs_setpos(); every other @whence is handed to
 * generic_file_llseek().  Returns -EIO on a shut down filesystem.
 */
STATIC loff_t
xfs_file_llseek(
	struct file	*file,
	loff_t		offset,
	int		whence)
{
	struct inode		*inode = file->f_mapping->host;

	if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount))
		return -EIO;

	switch (whence) {
	case SEEK_HOLE:
		offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops);
		break;
	case SEEK_DATA:
		offset = iomap_seek_data(inode, offset, &xfs_iomap_ops);
		break;
	default:
		return generic_file_llseek(file, offset, whence);
	}

	/* A negative offset is an error code from the iomap seek. */
	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}

1036 1037 1038 1039 1040
/*
 * Locking for serialisation of IO during page faults. This results in a lock
 * ordering of:
 *
 * mmap_sem (MM)
1041
 *   sb_start_pagefault(vfs, freeze)
1042
 *     i_mmaplock (XFS - truncate serialisation)
1043 1044
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
1045
 */
1046
static vm_fault_t
1047 1048 1049 1050
__xfs_filemap_fault(
	struct vm_fault		*vmf,
	enum page_entry_size	pe_size,
	bool			write_fault)
1051
{
1052
	struct inode		*inode = file_inode(vmf->vma->vm_file);
1053
	struct xfs_inode	*ip = XFS_I(inode);
1054
	vm_fault_t		ret;
1055

1056
	trace_xfs_filemap_fault(ip, pe_size, write_fault);
1057

1058 1059 1060 1061
	if (write_fault) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vmf->vma->vm_file);
	}
1062

1063
	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
1064
	if (IS_DAX(inode)) {
1065 1066
		pfn_t pfn;

1067
		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &xfs_iomap_ops);
1068 1069
		if (ret & VM_FAULT_NEEDDSYNC)
			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
1070
	} else {
1071 1072 1073 1074
		if (write_fault)
			ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
		else
			ret = filemap_fault(vmf);
1075 1076 1077
	}
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);

1078 1079
	if (write_fault)
		sb_end_pagefault(inode->i_sb);
1080
	return ret;
1081 1082
}

1083
static vm_fault_t
1084
xfs_filemap_fault(
1085 1086
	struct vm_fault		*vmf)
{
1087
	/* DAX can shortcut the normal fault path on write faults! */
1088 1089 1090
	return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
			IS_DAX(file_inode(vmf->vma->vm_file)) &&
			(vmf->flags & FAULT_FLAG_WRITE));
1091 1092
}

1093
static vm_fault_t
1094
xfs_filemap_huge_fault(
1095 1096
	struct vm_fault		*vmf,
	enum page_entry_size	pe_size)
M
Matthew Wilcox 已提交
1097
{
1098
	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
M
Matthew Wilcox 已提交
1099 1100
		return VM_FAULT_FALLBACK;

1101 1102 1103 1104
	/* DAX can shortcut the normal fault path on write faults! */
	return __xfs_filemap_fault(vmf, pe_size,
			(vmf->flags & FAULT_FLAG_WRITE));
}
M
Matthew Wilcox 已提交
1105

1106
static vm_fault_t
1107 1108 1109 1110
xfs_filemap_page_mkwrite(
	struct vm_fault		*vmf)
{
	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
M
Matthew Wilcox 已提交
1111 1112
}

1113
/*
1114 1115 1116
 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
 * on write faults. In reality, it needs to serialise against truncate and
 * prepare memory for writing so handle is as standard write fault.
1117
 */
1118
static vm_fault_t
1119 1120 1121 1122
xfs_filemap_pfn_mkwrite(
	struct vm_fault		*vmf)
{

1123
	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
M
Matthew Wilcox 已提交
1124 1125
}

1126 1127
static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= xfs_filemap_fault,
1128
	.huge_fault	= xfs_filemap_huge_fault,
1129 1130
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
1131
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
1132 1133 1134 1135 1136 1137 1138
};

STATIC int
xfs_file_mmap(
	struct file	*filp,
	struct vm_area_struct *vma)
{
1139 1140 1141 1142 1143 1144 1145
	/*
	 * We don't support synchronous mappings for non-DAX files. At least
	 * until someone comes with a sensible use case.
	 */
	if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
		return -EOPNOTSUPP;

1146 1147 1148
	file_accessed(filp);
	vma->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(file_inode(filp)))
1149
		vma->vm_flags |= VM_HUGEPAGE;
1150
	return 0;
1151 1152
}

1153
const struct file_operations xfs_file_operations = {
1154
	.llseek		= xfs_file_llseek,
A
Al Viro 已提交
1155
	.read_iter	= xfs_file_read_iter,
A
Al Viro 已提交
1156
	.write_iter	= xfs_file_write_iter,
1157
	.splice_read	= generic_file_splice_read,
A
Al Viro 已提交
1158
	.splice_write	= iter_file_splice_write,
1159
	.unlocked_ioctl	= xfs_file_ioctl,
L
Linus Torvalds 已提交
1160
#ifdef CONFIG_COMPAT
1161
	.compat_ioctl	= xfs_file_compat_ioctl,
L
Linus Torvalds 已提交
1162
#endif
1163
	.mmap		= xfs_file_mmap,
1164
	.mmap_supported_flags = MAP_SYNC,
1165 1166 1167
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
1168
	.get_unmapped_area = thp_get_unmapped_area,
1169
	.fallocate	= xfs_file_fallocate,
1170
	.remap_file_range = xfs_file_remap_range,
L
Linus Torvalds 已提交
1171 1172
};

1173
const struct file_operations xfs_dir_file_operations = {
1174
	.open		= xfs_dir_open,
L
Linus Torvalds 已提交
1175
	.read		= generic_read_dir,
1176
	.iterate_shared	= xfs_file_readdir,
1177
	.llseek		= generic_file_llseek,
1178
	.unlocked_ioctl	= xfs_file_ioctl,
1179
#ifdef CONFIG_COMPAT
1180
	.compat_ioctl	= xfs_file_compat_ioctl,
1181
#endif
1182
	.fsync		= xfs_dir_fsync,
L
Linus Torvalds 已提交
1183
};