// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_bmap_util.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"
#include "xfs_ialloc.h"

#include <linux/iversion.h>

/* Radix tree tags for incore inode tree. */

/* inode is to be reclaimed */
#define XFS_ICI_RECLAIM_TAG	0
/* Inode has speculative preallocations (posteof or cow) to clean. */
#define XFS_ICI_BLOCKGC_TAG	1

/*
 * The goal for walking incore inodes.  These can correspond with incore inode
 * radix tree tags when convenient.  Avoid existing XFS_IWALK namespace.
 */
enum xfs_icwalk_goal {
	/* Goals directly associated with tagged inodes. */
	XFS_ICWALK_BLOCKGC	= XFS_ICI_BLOCKGC_TAG,
	XFS_ICWALK_RECLAIM	= XFS_ICI_RECLAIM_TAG,
};

#define XFS_ICWALK_NULL_TAG	(-1U)

/* Compute the inode radix tree tag for this goal. */
static inline unsigned int
xfs_icwalk_tag(enum xfs_icwalk_goal goal)
{
	return goal < 0 ? XFS_ICWALK_NULL_TAG : goal;
}

static int xfs_icwalk(struct xfs_mount *mp,
		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
static int xfs_icwalk_ag(struct xfs_perag *pag,
		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);

/*
 * Private inode cache walk flags for struct xfs_icwalk.  Must not
 * coincide with XFS_ICWALK_FLAGS_VALID.
 */

/* Stop scanning after icw_scan_limit inodes. */
#define XFS_ICWALK_FLAG_SCAN_LIMIT	(1U << 28)

#define XFS_ICWALK_FLAG_RECLAIM_SICK	(1U << 27)
#define XFS_ICWALK_FLAG_UNION		(1U << 26) /* union filter algorithm */

#define XFS_ICWALK_PRIVATE_FLAGS	(XFS_ICWALK_FLAG_SCAN_LIMIT | \
					 XFS_ICWALK_FLAG_RECLAIM_SICK | \
					 XFS_ICWALK_FLAG_UNION)

/*
 * Allocate and initialise an xfs_inode.
 */
struct xfs_inode *
xfs_inode_alloc(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;

	/*
	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
	 * and return NULL here on ENOMEM.
	 */
	ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL);

	if (inode_init_always(mp->m_super, VFS_I(ip))) {
		kmem_cache_free(xfs_inode_zone, ip);
		return NULL;
	}

	/* VFS doesn't initialise i_mode or i_state! */
	VFS_I(ip)->i_mode = 0;
	VFS_I(ip)->i_state = 0;

	XFS_STATS_INC(mp, vn_active);
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(ip->i_ino == 0);

	/* initialise the xfs inode */
	ip->i_ino = ino;
	ip->i_mount = mp;
	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
	ip->i_cowfp = NULL;
	memset(&ip->i_af, 0, sizeof(ip->i_af));
	ip->i_af.if_format = XFS_DINODE_FMT_EXTENTS;
	memset(&ip->i_df, 0, sizeof(ip->i_df));
	ip->i_flags = 0;
	ip->i_delayed_blks = 0;
	memset(&ip->i_d, 0, sizeof(ip->i_d));
	ip->i_sick = 0;
	ip->i_checked = 0;
	INIT_WORK(&ip->i_ioend_work, xfs_end_io);
	INIT_LIST_HEAD(&ip->i_ioend_list);
	spin_lock_init(&ip->i_ioend_lock);

	return ip;
}

STATIC void
xfs_inode_free_callback(
	struct rcu_head		*head)
{
	struct inode		*inode = container_of(head, struct inode, i_rcu);
	struct xfs_inode	*ip = XFS_I(inode);

	switch (VFS_I(ip)->i_mode & S_IFMT) {
	case S_IFREG:
	case S_IFDIR:
	case S_IFLNK:
		xfs_idestroy_fork(&ip->i_df);
		break;
	}

	xfs_ifork_zap_attr(ip);

	if (ip->i_cowfp) {
		xfs_idestroy_fork(ip->i_cowfp);
		kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
	}
	if (ip->i_itemp) {
		ASSERT(!test_bit(XFS_LI_IN_AIL,
				 &ip->i_itemp->ili_item.li_flags));
		xfs_inode_item_destroy(ip);
		ip->i_itemp = NULL;
	}

	kmem_cache_free(xfs_inode_zone, ip);
}

static void
__xfs_inode_free(
	struct xfs_inode	*ip)
{
	/* asserts to verify all state is correct here */
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
	XFS_STATS_DEC(ip->i_mount, vn_active);

	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}

void
xfs_inode_free(
	struct xfs_inode	*ip)
{
	ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));

	/*
	 * Because we use RCU freeing we need to ensure the inode always
	 * appears to be reclaimed with an invalid inode number when in the
	 * free state. The ip->i_flags_lock provides the barrier against lookup
	 * races.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	__xfs_inode_free(ip);
}

/*
 * Queue background inode reclaim work if there are reclaimable inodes and there
 * isn't reclaim work already scheduled or in progress.
 */
static void
xfs_reclaim_work_queue(
	struct xfs_mount        *mp)
{

	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
	}
	rcu_read_unlock();
}

/*
 * Background scanning to trim preallocated space. This is queued based on the
 * 'speculative_prealloc_lifetime' tunable (5m by default).
 */
static inline void
xfs_blockgc_queue(
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	if (!xfs_is_blockgc_enabled(mp))
		return;

	rcu_read_lock();
	if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
		queue_delayed_work(pag->pag_mount->m_blockgc_wq,
				   &pag->pag_blockgc_work,
				   msecs_to_jiffies(xfs_blockgc_secs * 1000));
	rcu_read_unlock();
}

/* Set a tag on both the AG incore inode tree and the AG radix tree. */
static void
xfs_perag_set_inode_tag(
	struct xfs_perag	*pag,
	xfs_agino_t		agino,
	unsigned int		tag)
{
	struct xfs_mount	*mp = pag->pag_mount;
	bool			was_tagged;

	lockdep_assert_held(&pag->pag_ici_lock);

	was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
	radix_tree_tag_set(&pag->pag_ici_root, agino, tag);

	if (tag == XFS_ICI_RECLAIM_TAG)
		pag->pag_ici_reclaimable++;

	if (was_tagged)
		return;

	/* propagate the tag up into the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
	spin_unlock(&mp->m_perag_lock);

	/* start background work */
	switch (tag) {
	case XFS_ICI_RECLAIM_TAG:
		xfs_reclaim_work_queue(mp);
		break;
	case XFS_ICI_BLOCKGC_TAG:
		xfs_blockgc_queue(pag);
		break;
	}

	trace_xfs_perag_set_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
}

/* Clear a tag on both the AG incore inode tree and the AG radix tree. */
static void
xfs_perag_clear_inode_tag(
	struct xfs_perag	*pag,
	xfs_agino_t		agino,
	unsigned int		tag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	lockdep_assert_held(&pag->pag_ici_lock);

	/*
	 * Reclaim can signal (with a null agino) that it cleared its own tag
	 * by removing the inode from the radix tree.
	 */
	if (agino != NULLAGINO)
		radix_tree_tag_clear(&pag->pag_ici_root, agino, tag);
	else
		ASSERT(tag == XFS_ICI_RECLAIM_TAG);

	if (tag == XFS_ICI_RECLAIM_TAG)
		pag->pag_ici_reclaimable--;

	if (radix_tree_tagged(&pag->pag_ici_root, tag))
		return;

	/* clear the tag from the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
	spin_unlock(&mp->m_perag_lock);

	trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
}

/*
 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
 * part of the structure. This is made more complex by the fact we store
 * information about the on-disk values in the VFS inode and so we can't just
 * overwrite the values unconditionally. Hence we save the parameters we
 * need to retain across reinitialisation, and rewrite them into the VFS inode
 * after reinitialisation even if it fails.
 */
static int
xfs_reinit_inode(
	struct xfs_mount	*mp,
	struct inode		*inode)
{
	int			error;
	uint32_t		nlink = inode->i_nlink;
	uint32_t		generation = inode->i_generation;
	uint64_t		version = inode_peek_iversion(inode);
	umode_t			mode = inode->i_mode;
	dev_t			dev = inode->i_rdev;
	kuid_t			uid = inode->i_uid;
	kgid_t			gid = inode->i_gid;

	error = inode_init_always(mp->m_super, inode);

	set_nlink(inode, nlink);
	inode->i_generation = generation;
	inode_set_iversion_queried(inode, version);
	inode->i_mode = mode;
	inode->i_rdev = dev;
	inode->i_uid = uid;
	inode->i_gid = gid;
	return error;
}

/*
 * Carefully nudge an inode whose VFS state has been torn down back into a
 * usable state.  Drops the i_flags_lock and the rcu read lock.
 */
static int
xfs_iget_recycle(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip) __releases(&ip->i_flags_lock)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct inode		*inode = VFS_I(ip);
	int			error;

	trace_xfs_iget_recycle(ip);

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
		return -EAGAIN;

	/*
	 * We need to make it look like the inode is being reclaimed to prevent
	 * the actual reclaim workers from stomping over us while we recycle
	 * the inode.  We can't clear the radix tree tag yet as it requires
	 * pag_ici_lock to be held exclusive.
	 */
	ip->i_flags |= XFS_IRECLAIM;

	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();

	ASSERT(!rwsem_is_locked(&inode->i_rwsem));
	error = xfs_reinit_inode(mp, inode);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	if (error) {
		/*
		 * Re-initializing the inode failed, and we are in deep
		 * trouble.  Try to re-add it to the reclaim list.
		 */
		rcu_read_lock();
		spin_lock(&ip->i_flags_lock);
		ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
		ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();

		trace_xfs_iget_recycle_fail(ip);
		return error;
	}

	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	/*
	 * Clear the per-lifetime state in the inode as we are now effectively
	 * a new inode and need to return to the initial state before reuse
	 * occurs.
	 */
	ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
	ip->i_flags |= XFS_INEW;
	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_RECLAIM_TAG);
	inode->i_state = I_NEW;
	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);

	return 0;
}

/*
 * If we are allocating a new inode, then check what was returned is
 * actually a free, empty inode. If we are not allocating an inode,
 * then check we didn't find a free inode.
 *
 * Returns:
 *	0		if the inode free state matches the lookup context
 *	-ENOENT		if the inode is free and we are not allocating
 *	-EFSCORRUPTED	if there is any state mismatch at all
 */
static int
xfs_iget_check_free_state(
	struct xfs_inode	*ip,
	int			flags)
{
	if (flags & XFS_IGET_CREATE) {
		/* should be a free inode */
		if (VFS_I(ip)->i_mode != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
				ip->i_ino, VFS_I(ip)->i_mode);
			return -EFSCORRUPTED;
		}

		if (ip->i_d.di_nblocks != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx has blocks allocated!",
				ip->i_ino);
			return -EFSCORRUPTED;
		}
		return 0;
	}

	/* should be an allocated inode */
	if (VFS_I(ip)->i_mode == 0)
		return -ENOENT;

	return 0;
}

/* Make all pending inactivation work start immediately. */
static void
xfs_inodegc_queue_all(
	struct xfs_mount	*mp)
{
	struct xfs_inodegc	*gc;
	int			cpu;

	for_each_online_cpu(cpu) {
		gc = per_cpu_ptr(mp->m_inodegc, cpu);
		if (!llist_empty(&gc->list))
			queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
	}
}

/*
 * Check the validity of the inode we just found in the cache
 */
static int
xfs_iget_cache_hit(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip,
	xfs_ino_t		ino,
	int			flags,
	int			lock_flags) __releases(RCU)
{
	struct inode		*inode = VFS_I(ip);
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/*
	 * check for re-use of an inode within an RCU grace period due to the
	 * radix tree nodes not being updated yet. We monitor for this by
	 * setting the inode number to zero before freeing the inode structure.
	 * If the inode has been reallocated and set up, then the inode number
	 * will not match, so check for that, too.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino)
		goto out_skip;

	/*
	 * If we are racing with another cache hit that is currently
	 * instantiating this inode or currently recycling it out of
	 * reclaimable state, wait for the initialisation to complete
	 * before continuing.
	 *
	 * If we're racing with the inactivation worker we also want to wait.
	 * If we're creating a new file, it's possible that the worker
	 * previously marked the inode as free on disk but hasn't finished
	 * updating the incore state yet.  The AGI buffer will be dirty and
	 * locked to the icreate transaction, so a synchronous push of the
	 * inodegc workers would result in deadlock.  For a regular iget, the
	 * worker is running already, so we might as well wait.
	 *
	 * XXX(hch): eventually we should do something equivalent to
	 *	     wait_on_inode to wait for these flags to be cleared
	 *	     instead of polling for it.
	 */
	if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
		goto out_skip;

	if (ip->i_flags & XFS_NEED_INACTIVE) {
		/* Unlinked inodes cannot be re-grabbed. */
		if (VFS_I(ip)->i_nlink == 0) {
			error = -ENOENT;
			goto out_error;
		}
		goto out_inodegc_flush;
	}

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_error;

	/* Skip inodes that have no vfs state. */
	if ((flags & XFS_IGET_INCORE) &&
	    (ip->i_flags & XFS_IRECLAIMABLE))
		goto out_skip;

	/* The inode fits the selection criteria; process it. */
	if (ip->i_flags & XFS_IRECLAIMABLE) {
		/* Drops i_flags_lock and RCU read lock. */
		error = xfs_iget_recycle(pag, ip);
		if (error == -EAGAIN)
			goto out_skip;
		if (error)
			return error;
	} else {
		/* If the VFS inode is being torn down, pause and try again. */
		if (!igrab(inode))
			goto out_skip;

		/* We've got a live one. */
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		trace_xfs_iget_hit(ip);
	}

	if (lock_flags != 0)
		xfs_ilock(ip, lock_flags);

	if (!(flags & XFS_IGET_INCORE))
		xfs_iflags_clear(ip, XFS_ISTALE);
	XFS_STATS_INC(mp, xs_ig_found);

	return 0;

out_skip:
	trace_xfs_iget_skip(ip);
	XFS_STATS_INC(mp, xs_ig_frecycle);
	error = -EAGAIN;
out_error:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	return error;

out_inodegc_flush:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	/*
	 * Do not wait for the workers, because the caller could hold an AGI
	 * buffer lock.  We're just going to sleep in a loop anyway.
	 */
	if (xfs_is_inodegc_enabled(mp))
		xfs_inodegc_queue_all(mp);
	return -EAGAIN;
}

static int
xfs_iget_cache_miss(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_trans_t		*tp,
	xfs_ino_t		ino,
	struct xfs_inode	**ipp,
	int			flags,
	int			lock_flags)
{
	struct xfs_inode	*ip;
	int			error;
	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
	int			iflags;

	ip = xfs_inode_alloc(mp, ino);
	if (!ip)
		return -ENOMEM;

	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags);
	if (error)
		goto out_destroy;

	/*
	 * For version 5 superblocks, if we are initialising a new inode and we
	 * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can
	 * simply build the new inode core with a random generation number.
	 *
	 * For version 4 (and older) superblocks, log recovery is dependent on
	 * the di_flushiter field being initialised from the current on-disk
	 * value and hence we must also read the inode off disk even when
	 * initializing new inodes.
	 */
	if (xfs_sb_version_has_v3inode(&mp->m_sb) &&
	    (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) {
		VFS_I(ip)->i_generation = prandom_u32();
	} else {
		struct xfs_dinode	*dip;
		struct xfs_buf		*bp;

		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0);
		if (error)
			goto out_destroy;

		error = xfs_inode_from_disk(ip, dip);
		if (!error)
			xfs_buf_set_ref(bp, XFS_INO_REF);
		xfs_trans_brelse(tp, bp);

		if (error)
			goto out_destroy;
	}

	trace_xfs_iget_miss(ip);

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_destroy;

	/*
	 * Preload the radix tree so we can insert safely under the
	 * write spinlock. Note that we cannot sleep inside the preload
	 * region. Since we can be called from transaction context, don't
	 * recurse into the file system.
	 */
	if (radix_tree_preload(GFP_NOFS)) {
		error = -EAGAIN;
		goto out_destroy;
	}

	/*
	 * Because the inode hasn't been added to the radix-tree yet it can't
	 * be found by another thread, so we can do the non-sleeping lock here.
	 */
	if (lock_flags) {
		if (!xfs_ilock_nowait(ip, lock_flags))
			BUG();
	}

	/*
	 * These values must be set before inserting the inode into the radix
	 * tree as the moment it is inserted a concurrent lookup (allowed by the
	 * RCU locking mechanism) can find it and that lookup must see that this
	 * is an inode currently under construction (i.e. that XFS_INEW is set).
	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
	 * memory barrier that ensures this detection works correctly at lookup
	 * time.
	 */
	iflags = XFS_INEW;
	if (flags & XFS_IGET_DONTCACHE)
		d_mark_dontcache(VFS_I(ip));
	ip->i_udquot = NULL;
	ip->i_gdquot = NULL;
	ip->i_pdquot = NULL;
	xfs_iflags_set(ip, iflags);

	/* insert the new inode */
	spin_lock(&pag->pag_ici_lock);
	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
	if (unlikely(error)) {
		WARN_ON(error != -EEXIST);
		XFS_STATS_INC(mp, xs_ig_dup);
		error = -EAGAIN;
		goto out_preload_end;
	}
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();

	*ipp = ip;
	return 0;

out_preload_end:
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
out_destroy:
	__destroy_inode(VFS_I(ip));
	xfs_inode_free(ip);
	return error;
}

/*
 * Look up an inode by number in the given file system.  The inode is looked up
 * in the cache held in each AG.  If the inode is found in the cache, initialise
 * the vfs inode if necessary.
 *
 * If it is not in core, read it in from the file system's device, add it to the
 * cache and initialise the vfs inode.
 *
 * The inode is locked according to the value of the lock_flags parameter.
 * Inode lookup is only done during metadata operations and not as part of the
 * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
 */
int
xfs_iget(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	uint			flags,
	uint			lock_flags,
	struct xfs_inode	**ipp)
{
	struct xfs_inode	*ip;
	struct xfs_perag	*pag;
	xfs_agino_t		agino;
	int			error;

	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);

	/* reject inode numbers outside existing AGs */
	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
		return -EINVAL;

	XFS_STATS_INC(mp, xs_ig_attempts);

	/* get the perag structure and ensure that it's inode capable */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
	agino = XFS_INO_TO_AGINO(mp, ino);

again:
	error = 0;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);

	if (ip) {
		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
		if (error)
			goto out_error_or_again;
	} else {
		rcu_read_unlock();
		if (flags & XFS_IGET_INCORE) {
			error = -ENODATA;
			goto out_error_or_again;
		}
		XFS_STATS_INC(mp, xs_ig_missed);

		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
							flags, lock_flags);
		if (error)
			goto out_error_or_again;
	}
	xfs_perag_put(pag);

	*ipp = ip;

	/*
	 * If we have a real type for an on-disk inode, we can setup the inode
	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
	 */
	if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
		xfs_setup_existing_inode(ip);
	return 0;

out_error_or_again:
	if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
		delay(1);
		goto again;
	}
	xfs_perag_put(pag);
	return error;
}
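
/*
 * Illustrative sketch (not part of the original file): a typical caller
 * pattern for xfs_iget() as described above.  The helper name is made up;
 * callers normally obtain the inode number from a directory lookup or an
 * on-disk reference, use the inode under the requested lock, and drop the
 * reference with xfs_irele() when done.
 */
static inline int __maybe_unused
xfs_icache_example_lookup(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;
	int			error;

	/* Look up (or read in) the inode and take the ILOCK shared. */
	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip);
	if (error)
		return error;

	/* ... use the inode under XFS_ILOCK_SHARED ... */

	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	xfs_irele(ip);
	return 0;
}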

/*
 * "Is this a cached inode that's also allocated?"
 *
 * Look up an inode by number in the given file system.  If the inode is
 * in cache and isn't in purgatory, return 1 if the inode is allocated
 * and 0 if it is not.  For all other cases (not in cache, being torn
 * down, etc.), return a negative error code.
 *
 * The caller has to prevent inode allocation and freeing activity,
 * presumably by locking the AGI buffer.   This is to ensure that an
 * inode cannot transition from allocated to freed until the caller is
 * ready to allow that.  If the inode is in an intermediate state (new,
 * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
 * inode is not in the cache, -ENOENT will be returned.  The caller must
 * deal with these scenarios appropriately.
 *
 * This is a specialized use case for the online scrubber; if you're
 * reading this, you probably want xfs_iget.
 */
int
xfs_icache_inode_is_allocated(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	bool			*inuse)
{
	struct xfs_inode	*ip;
	int			error;

	error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
	if (error)
		return error;

	*inuse = !!(VFS_I(ip)->i_mode);
	xfs_irele(ip);
	return 0;
}
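
/*
 * Illustrative sketch (not part of the original file): how a scrub-style
 * caller might interpret the result of xfs_icache_inode_is_allocated().
 * Per the comment above, -ENOENT and -EAGAIN only mean that the incore
 * cache could not answer the question, so the caller has to fall back to
 * checking the on-disk inode metadata; the helper name here is hypothetical.
 */
static inline int __maybe_unused
xfs_icache_example_check_inuse(
	struct xfs_mount	*mp,
	xfs_ino_t		ino,
	bool			*inuse)
{
	int			error;

	error = xfs_icache_inode_is_allocated(mp, NULL, ino, inuse);
	if (error == -ENOENT || error == -EAGAIN)
		return -EAGAIN;	/* not answerable from the cache */
	return error;
}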

/*
 * Grab the inode for reclaim exclusively.
 *
 * We have found this inode via a lookup under RCU, so the inode may have
 * already been freed, or it may be in the process of being recycled by
 * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
 * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
 * will not be set. Hence we need to check for both these flag conditions to
 * avoid inodes that are no longer reclaim candidates.
 *
 * Note: checking for other state flags here, under the i_flags_lock or not, is
 * racy and should be avoided. Those races should be resolved only after we have
 * ensured that we are able to reclaim this inode and the world can see that we
 * are going to reclaim it.
 *
 * Return true if we grabbed it, false otherwise.
 */
static bool
xfs_reclaim_igrab(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	ASSERT(rcu_read_lock_held());

	spin_lock(&ip->i_flags_lock);
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* not a reclaim candidate. */
		spin_unlock(&ip->i_flags_lock);
		return false;
	}

	/* Don't reclaim a sick inode unless the caller asked for it. */
	if (ip->i_sick &&
	    (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
		spin_unlock(&ip->i_flags_lock);
		return false;
	}

	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return true;
}

/*
 * Inode reclaim is non-blocking, so the default action if progress cannot be
 * made is to "requeue" the inode for reclaim by unlocking it and clearing the
 * XFS_IRECLAIM flag.  If we are in a shutdown state, we don't care about
 * blocking anymore and hence we can wait until we are able to reclaim the
 * inode.
 *
 * We do no IO here - if callers require inodes to be cleaned they must push the
 * AIL first to trigger writeback of dirty inodes.  This enables writeback to be
 * done in the background in a non-blocking manner, and enables memory reclaim
 * to make progress without blocking.
 */
static void
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag)
{
	xfs_ino_t		ino = ip->i_ino; /* for radix_tree_delete */

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
		goto out;
	if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
		goto out_iunlock;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_iunpin_wait(ip);
		xfs_iflush_shutdown_abort(ip);
		goto reclaim;
	}
	if (xfs_ipincount(ip))
		goto out_clear_flush;
	if (!xfs_inode_clean(ip))
		goto out_clear_flush;

	xfs_iflags_clear(ip, XFS_IFLUSHING);
reclaim:
	trace_xfs_inode_reclaiming(ip);

	/*
	 * Because we use RCU freeing we need to ensure the inode always appears
	 * to be reclaimed with an invalid inode number when in the free state.
	 * We do this as early as possible under the ILOCK so that
	 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
	 * detect races with us here. By doing this, we guarantee that once
	 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
	 * it will see either a valid inode that will serialise correctly, or it
	 * will see an invalid inode that it can skip.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	ip->i_sick = 0;
	ip->i_checked = 0;
	spin_unlock(&ip->i_flags_lock);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree assert that it's been there before to catch
	 * problems with the inode life time early on.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ino)))
		ASSERT(0);
	xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups.  This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	ASSERT(xfs_inode_clean(ip));

	__xfs_inode_free(ip);
	return;

out_clear_flush:
	xfs_iflags_clear(ip, XFS_IFLUSHING);
out_iunlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	xfs_iflags_clear(ip, XFS_IRECLAIM);
}

/* Reclaim sick inodes if we're unmounting or the fs went down. */
static inline bool
xfs_want_reclaim_sick(
	struct xfs_mount	*mp)
{
	return (mp->m_flags & XFS_MOUNT_UNMOUNTING) ||
	       (mp->m_flags & XFS_MOUNT_NORECOVERY) ||
	       XFS_FORCED_SHUTDOWN(mp);
}

void
xfs_reclaim_inodes(
	struct xfs_mount	*mp)
{
	struct xfs_icwalk	icw = {
		.icw_flags	= 0,
	};

	if (xfs_want_reclaim_sick(mp))
		icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;

	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		xfs_ail_push_all_sync(mp->m_ail);
		xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
	}
}

/*
 * The shrinker infrastructure determines how many inodes we should scan for
 * reclaim. We want as many clean inodes ready to reclaim as possible, so we
 * push the AIL here. We also want to proactively free up memory if we can to
 * minimise the amount of work memory reclaim has to do so we kick the
 * background reclaim if it isn't already scheduled.
 */
long
xfs_reclaim_inodes_nr(
	struct xfs_mount	*mp,
	unsigned long		nr_to_scan)
{
	struct xfs_icwalk	icw = {
		.icw_flags	= XFS_ICWALK_FLAG_SCAN_LIMIT,
		.icw_scan_limit	= min_t(unsigned long, LONG_MAX, nr_to_scan),
	};

	if (xfs_want_reclaim_sick(mp))
		icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;

	/* kick background reclaimer and push the AIL */
	xfs_reclaim_work_queue(mp);
	xfs_ail_push_all(mp->m_ail);

	xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
	return 0;
}

/*
 * Return the number of reclaimable inodes in the filesystem for
 * the shrinker to determine how much to reclaim.
 */
long
xfs_reclaim_inodes_count(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		ag = 0;
	long			reclaimable = 0;

	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		ag = pag->pag_agno + 1;
		reclaimable += pag->pag_ici_reclaimable;
		xfs_perag_put(pag);
	}
	return reclaimable;
}

STATIC bool
xfs_icwalk_match_id(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
	    !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
		return false;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
	    !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
		return false;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
	    ip->i_d.di_projid != icw->icw_prid)
		return false;

	return true;
}

/*
 * A union-based inode filtering algorithm. Process the inode if any of the
 * criteria match. This is for global/internal scans only.
 */
STATIC bool
xfs_icwalk_match_id_union(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
	    uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
		return true;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
	    gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
		return true;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
	    ip->i_d.di_projid == icw->icw_prid)
		return true;

	return false;
}

/*
 * Is this inode @ip eligible for eof/cow block reclamation, given some
 * filtering parameters @icw?  The inode is eligible if @icw is null or
 * if the predicate functions match.
 */
static bool
xfs_icwalk_match(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	bool			match;

	if (!icw)
		return true;

	if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
		match = xfs_icwalk_match_id_union(ip, icw);
	else
		match = xfs_icwalk_match_id(ip, icw);
	if (!match)
		return false;

	/* skip the inode if the file size is too small */
	if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
	    XFS_ISIZE(ip) < icw->icw_min_file_size)
		return false;

	return true;
}

/*
 * This is a fast pass over the inode cache to try to get reclaim moving on as
 * many inodes as possible in a short period of time. It kicks itself every few
 * seconds, as well as being kicked by the inode cache shrinker when memory
 * goes low.
 */
void
xfs_reclaim_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_reclaim_work);

	xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
	xfs_reclaim_work_queue(mp);
}

STATIC int
xfs_inode_free_eofblocks(
	struct xfs_inode	*ip,
1126
	struct xfs_icwalk	*icw,
1127
	unsigned int		*lockflags)
1128
{
1129 1130
	bool			wait;

1131
	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
1132

1133 1134 1135
	if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
		return 0;

1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146
	if (!xfs_can_free_eofblocks(ip, false)) {
		/* inode could be preallocated or append-only */
		trace_xfs_inode_free_eofblocks_invalid(ip);
		xfs_inode_clear_eofblocks_tag(ip);
		return 0;
	}

	/*
	 * If the mapping is dirty the operation can block and wait for some
	 * time. Unless we are waiting, skip it.
	 */
1147
	if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
1148 1149
		return 0;

1150
	if (!xfs_icwalk_match(ip, icw))
1151
		return 0;
1152

1153 1154 1155 1156
	/*
	 * If the caller is waiting, return -EAGAIN to keep the background
	 * scanner moving and revisit the inode in a subsequent pass.
	 */
1157
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1158 1159 1160
		if (wait)
			return -EAGAIN;
		return 0;
1161
	}
1162
	*lockflags |= XFS_IOLOCK_EXCL;
1163

1164
	return xfs_free_eofblocks(ip);
1165 1166
}

1167
static void
1168 1169 1170
xfs_blockgc_set_iflag(
	struct xfs_inode	*ip,
	unsigned long		iflag)
1171
{
1172 1173 1174 1175
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
1176

1177 1178 1179 1180
	/*
	 * Don't bother locking the AG and looking up in the radix trees
	 * if we already know that we have the tag set.
	 */
1181
	if (ip->i_flags & iflag)
1182 1183
		return;
	spin_lock(&ip->i_flags_lock);
1184
	ip->i_flags |= iflag;
1185 1186
	spin_unlock(&ip->i_flags_lock);

1187 1188 1189
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

1190 1191
	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_BLOCKGC_TAG);
1192 1193 1194 1195 1196 1197

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
1198
xfs_inode_set_eofblocks_tag(
1199
	xfs_inode_t	*ip)
1200 1201
{
	trace_xfs_inode_set_eofblocks_tag(ip);
1202
	return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
1203 1204 1205
}

static void
1206 1207 1208
xfs_blockgc_clear_iflag(
	struct xfs_inode	*ip,
	unsigned long		iflag)
1209
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;
	bool			clear_tag;

	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
1215

1216
	spin_lock(&ip->i_flags_lock);
1217 1218
	ip->i_flags &= ~iflag;
	clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
1219 1220
	spin_unlock(&ip->i_flags_lock);

1221 1222 1223
	if (!clear_tag)
		return;

1224 1225 1226
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

1227 1228
	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_BLOCKGC_TAG);

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_clear_eofblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_clear_eofblocks_tag(ip);
1239
	return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
1240 1241 1242
}

/*
1243 1244 1245
 * Set ourselves up to free CoW blocks from this file.  If it's already clean
 * then we can bail out quickly, but otherwise we must back off if the file
 * is undergoing some kind of write.
1246
 */
1247 1248
static bool
xfs_prep_free_cowblocks(
1249
	struct xfs_inode	*ip)
1250
{
1251 1252 1253 1254
	/*
	 * Just clear the tag if we have an empty cow fork or none at all. It's
	 * possible the inode was fully unshared since it was originally tagged.
	 */
1255
	if (!xfs_inode_has_cow_data(ip)) {
1256 1257
		trace_xfs_inode_free_cowblocks_invalid(ip);
		xfs_inode_clear_cowblocks_tag(ip);
1258
		return false;
	}

	/*
	 * If the mapping is dirty or under writeback we cannot touch the
	 * CoW fork.  Leave it alone if we're in the midst of a directio.
	 */
1265 1266
	if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
1267 1268
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
	    atomic_read(&VFS_I(ip)->i_dio_count))
		return false;

	return true;
}

/*
 * Automatic CoW Reservation Freeing
 *
 * These functions automatically garbage collect leftover CoW reservations
 * that were made on behalf of a cowextsize hint when we start to run out
 * of quota or when the reservations sit around for too long.  If the file
 * has dirty pages or is undergoing writeback, its CoW reservations will
 * be retained.
 *
 * The actual garbage collection piggybacks off the same code that runs
 * the speculative EOF preallocation garbage collector.
 */
STATIC int
xfs_inode_free_cowblocks(
	struct xfs_inode	*ip,
1289
	struct xfs_icwalk	*icw,
1290
	unsigned int		*lockflags)
1291
{
1292
	bool			wait;
1293 1294
	int			ret = 0;

1295
	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);
1296

1297 1298 1299
	if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
		return 0;

1300
	if (!xfs_prep_free_cowblocks(ip))
1301 1302
		return 0;

1303
	if (!xfs_icwalk_match(ip, icw))
1304
		return 0;
1305

1306 1307 1308 1309
	/*
	 * If the caller is waiting, return -EAGAIN to keep the background
	 * scanner moving and revisit the inode in a subsequent pass.
	 */
1310 1311
	if (!(*lockflags & XFS_IOLOCK_EXCL) &&
	    !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1312 1313 1314 1315
		if (wait)
			return -EAGAIN;
		return 0;
	}
1316 1317
	*lockflags |= XFS_IOLOCK_EXCL;

1318 1319
	if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
		if (wait)
1320 1321
			return -EAGAIN;
		return 0;
1322
	}
1323
	*lockflags |= XFS_MMAPLOCK_EXCL;
1324

1325 1326 1327 1328
	/*
	 * Check again, nobody else should be able to dirty blocks or change
	 * the reflink iflag now that we have the first two locks held.
	 */
1329
	if (xfs_prep_free_cowblocks(ip))
1330
		ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
	return ret;
}

void
xfs_inode_set_cowblocks_tag(
	xfs_inode_t	*ip)
{
1338
	trace_xfs_inode_set_cowblocks_tag(ip);
1339
	return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
}

void
xfs_inode_clear_cowblocks_tag(
	xfs_inode_t	*ip)
{
1346
	trace_xfs_inode_clear_cowblocks_tag(ip);
1347
	return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
1348
}
1349

#define for_each_perag_tag(mp, next_agno, pag, tag) \
	for ((next_agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \
		(pag) != NULL; \
		(next_agno) = (pag)->pag_agno + 1, \
		xfs_perag_put(pag), \
		(pag) = xfs_perag_get_tag((mp), (next_agno), (tag)))


1358 1359
/* Disable post-EOF and CoW block auto-reclamation. */
void
1360
xfs_blockgc_stop(
1361 1362
	struct xfs_mount	*mp)
{
1363 1364 1365
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;

1366 1367 1368
	if (!xfs_clear_blockgc_enabled(mp))
		return;

1369 1370
	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
		cancel_delayed_work_sync(&pag->pag_blockgc_work);
1371
	trace_xfs_blockgc_stop(mp, __return_address);
1372 1373 1374 1375
}

/* Enable post-EOF and CoW block auto-reclamation. */
void
1376
xfs_blockgc_start(
1377 1378
	struct xfs_mount	*mp)
{
1379 1380 1381
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;

1382 1383 1384 1385
	if (xfs_set_blockgc_enabled(mp))
		return;

	trace_xfs_blockgc_start(mp, __return_address);
1386 1387
	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
		xfs_blockgc_queue(pag);
1388
}
1389

1390 1391
/* Don't try to run block gc on an inode that's in any of these states. */
#define XFS_BLOCKGC_NOGRAB_IFLAGS	(XFS_INEW | \
1392 1393
					 XFS_NEED_INACTIVE | \
					 XFS_INACTIVATING | \
1394 1395
					 XFS_IRECLAIMABLE | \
					 XFS_IRECLAIM)
1396
/*
1397 1398 1399
 * Decide if the given @ip is eligible for garbage collection of speculative
 * preallocations, and grab it if so.  Returns true if it's ready to go or
 * false if we should just ignore it.
1400 1401
 */
static bool
1402
xfs_blockgc_igrab(
1403
	struct xfs_inode	*ip)
{
	struct inode		*inode = VFS_I(ip);

	ASSERT(rcu_read_lock_held());

	/* Check for stale RCU freed inode */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

1414
	if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return false;

	/* If we can't grab the inode, it must be on its way to reclaim. */
	if (!igrab(inode))
		return false;

	/* inode is valid */
	return true;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return false;
}

1434 1435 1436 1437
/* Scan one incore inode for block preallocations that we can remove. */
static int
xfs_blockgc_scan_inode(
	struct xfs_inode	*ip,
1438
	struct xfs_icwalk	*icw)
1439
{
1440
	unsigned int		lockflags = 0;
1441 1442
	int			error;

1443
	error = xfs_inode_free_eofblocks(ip, icw, &lockflags);
1444
	if (error)
1445
		goto unlock;
1446

1447
	error = xfs_inode_free_cowblocks(ip, icw, &lockflags);
1448 1449 1450
unlock:
	if (lockflags)
		xfs_iunlock(ip, lockflags);
1451
	xfs_irele(ip);
1452
	return error;
1453 1454
}

/* Background worker that trims preallocated space. */
void
xfs_blockgc_worker(
	struct work_struct	*work)
{
1460 1461 1462
	struct xfs_perag	*pag = container_of(to_delayed_work(work),
					struct xfs_perag, pag_blockgc_work);
	struct xfs_mount	*mp = pag->pag_mount;
1463 1464
	int			error;

1465 1466
	trace_xfs_blockgc_worker(mp, __return_address);

1467
	error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
1468
	if (error)
1469 1470 1471
		xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
				pag->pag_agno, error);
	xfs_blockgc_queue(pag);
1472 1473
}

1474
/*
1475 1476
 * Try to free space in the filesystem by purging inactive inodes, eofblocks
 * and cowblocks.
1477 1478 1479 1480
 */
int
xfs_blockgc_free_space(
	struct xfs_mount	*mp,
1481
	struct xfs_icwalk	*icw)
1482
{
1483 1484
	int			error;

1485
	trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);
1486

1487 1488 1489 1490 1491 1492
	error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
	if (error)
		return error;

	xfs_inodegc_flush(mp);
	return 0;
1493 1494
}

/*
 * Reclaim all the free space that we can by scheduling the background blockgc
 * and inodegc workers immediately and waiting for them all to clear.
 */
void
xfs_blockgc_flush_all(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;

	trace_xfs_blockgc_flush_all(mp, __return_address);

	/*
	 * For each blockgc worker, move its queue time up to now.  If it
	 * wasn't queued, it will not be requeued.  Then flush whatever's
	 * left.
	 */
	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
		mod_delayed_work(pag->pag_mount->m_blockgc_wq,
				&pag->pag_blockgc_work, 0);

	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
		flush_delayed_work(&pag->pag_blockgc_work);

	xfs_inodegc_flush(mp);
}

1523
/*
1524 1525 1526 1527
 * Run cow/eofblocks scans on the supplied dquots.  We don't know exactly which
 * quota caused an allocation failure, so we make a best effort by including
 * each quota under low free space conditions (less than 1% free space) in the
 * scan.
1528 1529
 *
 * Callers must not hold any inode's ILOCK.  If requesting a synchronous scan
1530
 * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
1531
 * MMAPLOCK.
1532
 */
1533
int
xfs_blockgc_free_dquots(
	struct xfs_mount	*mp,
	struct xfs_dquot	*udqp,
	struct xfs_dquot	*gdqp,
	struct xfs_dquot	*pdqp,
1539
	unsigned int		iwalk_flags)
1540
{
1541
	struct xfs_icwalk	icw = {0};
1542 1543
	bool			do_work = false;

1544 1545 1546
	if (!udqp && !gdqp && !pdqp)
		return 0;

1547
	/*
1548 1549
	 * Run a scan to free blocks using the union filter to cover all
	 * applicable quotas in a single scan.
1550
	 */
1551
	icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;
1552

1553
	if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
1554 1555
		icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
		icw.icw_flags |= XFS_ICWALK_FLAG_UID;
1556
		do_work = true;
1557 1558
	}

1559
	if (XFS_IS_UQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
1560 1561
		icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
		icw.icw_flags |= XFS_ICWALK_FLAG_GID;
1562
		do_work = true;
1563 1564
	}

1565
	if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
1566 1567
		icw.icw_prid = pdqp->q_id;
		icw.icw_flags |= XFS_ICWALK_FLAG_PRID;
1568
		do_work = true;
1569 1570 1571
	}

	if (!do_work)
1572
		return 0;
1573

1574
	return xfs_blockgc_free_space(mp, &icw);
}

/* Run cow/eofblocks scans on the quotas attached to the inode. */
int
xfs_blockgc_free_quota(
	struct xfs_inode	*ip,
1581
	unsigned int		iwalk_flags)
1582 1583 1584 1585
{
	return xfs_blockgc_free_dquots(ip->i_mount,
			xfs_inode_dquot(ip, XFS_DQTYPE_USER),
			xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
1586
			xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
1587
}
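
/*
 * Illustrative sketch (not part of the original file): the usual shape of a
 * caller reacting to a quota or space failure, as hinted at above.  On
 * -EDQUOT/-ENOSPC the caller frees speculative preallocations charged to the
 * inode's quotas and then retries its reservation once.  The helper name and
 * the retry structure here are assumptions, not this file's API.
 */
static inline int __maybe_unused
xfs_icache_example_retry_reservation(
	struct xfs_inode	*ip,
	int			error)
{
	if (error != -EDQUOT && error != -ENOSPC)
		return error;

	/* Trim post-EOF and CoW preallocations attached to ip's quotas. */
	error = xfs_blockgc_free_quota(ip, 0);
	if (error)
		return error;
	return -EAGAIN;	/* caller retries its reservation */
}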

/* XFS Inode Cache Walking Code */

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum. The batch size is a trade off between
 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH	32


1600 1601
/*
 * Decide if we want to grab this inode in anticipation of doing work towards
1602
 * the goal.
1603 1604 1605 1606
 */
static inline bool
xfs_icwalk_igrab(
	enum xfs_icwalk_goal	goal,
1607
	struct xfs_inode	*ip,
1608
	struct xfs_icwalk	*icw)
1609 1610 1611
{
	switch (goal) {
	case XFS_ICWALK_BLOCKGC:
1612
		return xfs_blockgc_igrab(ip);
1613
	case XFS_ICWALK_RECLAIM:
1614
		return xfs_reclaim_igrab(ip, icw);
	default:
		return false;
	}
}

1620 1621 1622 1623
/*
 * Process an inode.  Each processing function must handle any state changes
 * made by the icwalk igrab function.  Return -EAGAIN to skip an inode.
 */
1624 1625 1626 1627
static inline int
xfs_icwalk_process_inode(
	enum xfs_icwalk_goal	goal,
	struct xfs_inode	*ip,
1628
	struct xfs_perag	*pag,
1629
	struct xfs_icwalk	*icw)
1630
{
1631
	int			error = 0;
1632 1633 1634

	switch (goal) {
	case XFS_ICWALK_BLOCKGC:
1635
		error = xfs_blockgc_scan_inode(ip, icw);
1636
		break;
1637 1638 1639
	case XFS_ICWALK_RECLAIM:
		xfs_reclaim_inode(ip, pag);
		break;
1640 1641 1642 1643
	}
	return error;
}

1644
/*
1645 1646
 * For a given per-AG structure @pag and a goal, grab qualifying inodes and
 * process them in some manner.
1647 1648
 */
static int
1649
xfs_icwalk_ag(
1650
	struct xfs_perag	*pag,
1651
	enum xfs_icwalk_goal	goal,
1652
	struct xfs_icwalk	*icw)
{
	struct xfs_mount	*mp = pag->pag_mount;
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;
	bool			done;
	int			nr_found;

restart:
	done = false;
	skipped = 0;
1664 1665 1666 1667
	if (goal == XFS_ICWALK_RECLAIM)
		first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
	else
		first_index = 0;
1668 1669 1670
	nr_found = 0;
	do {
		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
1671
		unsigned int	tag = xfs_icwalk_tag(goal);
		int		error = 0;
		int		i;

		rcu_read_lock();

1677
		if (tag == XFS_ICWALK_NULL_TAG)
			nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH);
		else
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **) batch, first_index,
					XFS_LOOKUP_BATCH, tag);

		if (!nr_found) {
1688
			done = true;
			rcu_read_unlock();
			break;
		}

		/*
		 * Grab the inodes before we drop the lock. if we found
		 * nothing, nr == 0 and the loop will be skipped.
		 */
		for (i = 0; i < nr_found; i++) {
			struct xfs_inode *ip = batch[i];

1700
			if (done || !xfs_icwalk_igrab(goal, ip, icw))
				batch[i] = NULL;

			/*
			 * Update the index for the next lookup. Catch
			 * overflows into the next AG range which can occur if
			 * we have inodes in the last block of the AG and we
			 * are currently pointing to the last inode.
			 *
			 * Because we may see inodes that are from the wrong AG
			 * due to RCU freeing and reallocation, only update the
			 * index if it lies in this AG. It was a race that lead
			 * us to see this inode, so another lookup from the
			 * same index will not find it again.
			 */
			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
				continue;
			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
				done = true;
		}

		/* unlock now we've grabbed the inodes. */
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
1728
			error = xfs_icwalk_process_inode(goal, batch[i], pag,
1729
					icw);
			if (error == -EAGAIN) {
				skipped++;
				continue;
			}
			if (error && last_error != -EFSCORRUPTED)
				last_error = error;
		}

		/* bail out if the filesystem is corrupted.  */
		if (error == -EFSCORRUPTED)
			break;

		cond_resched();

1744 1745 1746
		if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
			icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
			if (icw->icw_scan_limit <= 0)
1747 1748
				break;
		}
1749 1750
	} while (nr_found && !done);

	if (goal == XFS_ICWALK_RECLAIM) {
		if (done)
			first_index = 0;
		WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
	}

	if (skipped) {
		delay(1);
		goto restart;
	}
	return last_error;
}

/* Fetch the next (possibly tagged) per-AG structure. */
static inline struct xfs_perag *
1766
xfs_icwalk_get_perag(
1767 1768
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
1769
	enum xfs_icwalk_goal	goal)
1770
{
1771 1772 1773
	unsigned int		tag = xfs_icwalk_tag(goal);

	if (tag == XFS_ICWALK_NULL_TAG)
1774 1775 1776 1777
		return xfs_perag_get(mp, agno);
	return xfs_perag_get_tag(mp, agno, tag);
}

1778
/* Walk all incore inodes to achieve a given goal. */
1779
static int
1780
xfs_icwalk(
1781
	struct xfs_mount	*mp,
1782
	enum xfs_icwalk_goal	goal,
1783
	struct xfs_icwalk	*icw)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		agno = 0;

1790
	while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) {
1791
		agno = pag->pag_agno + 1;
1792
		error = xfs_icwalk_ag(pag, goal, icw);
		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			if (error == -EFSCORRUPTED)
				break;
		}
	}
	return last_error;
1801
	BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
1802
}

#ifdef DEBUG
static void
xfs_check_delalloc(
	struct xfs_inode	*ip,
	int			whichfork)
{
1810
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	struct xfs_bmbt_irec	got;
	struct xfs_iext_cursor	icur;

	if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
		return;
	do {
		if (isnullstartblock(got.br_startblock)) {
			xfs_warn(ip->i_mount,
	"ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
				ip->i_ino,
				whichfork == XFS_DATA_FORK ? "data" : "cow",
				got.br_startoff, got.br_blockcount);
		}
	} while (xfs_iext_next_extent(ifp, &icur, &got));
}
#else
#define xfs_check_delalloc(ip, whichfork)	do { } while (0)
#endif

/* Schedule the inode for reclaim. */
static void
xfs_inodegc_set_reclaimable(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	if (!XFS_FORCED_SHUTDOWN(mp) && ip->i_delayed_blks) {
		xfs_check_delalloc(ip, XFS_DATA_FORK);
		xfs_check_delalloc(ip, XFS_COW_FORK);
		ASSERT(0);
	}

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

1848 1849 1850
	trace_xfs_inode_set_reclaimable(ip);
	ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING);
	ip->i_flags |= XFS_IRECLAIMABLE;
	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_RECLAIM_TAG);

	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

/*
 * Free all speculative preallocations and possibly even the inode itself.
 * This is the last chance to make changes to an otherwise unreferenced file
 * before incore reclamation happens.
 */
static void
xfs_inodegc_inactivate(
	struct xfs_inode	*ip)
{
	trace_xfs_inode_inactivating(ip);
	xfs_inactive(ip);
	xfs_inodegc_set_reclaimable(ip);
}

void
xfs_inodegc_worker(
	struct work_struct	*work)
{
	struct xfs_inodegc	*gc = container_of(work, struct xfs_inodegc,
							work);
	struct llist_node	*node = llist_del_all(&gc->list);
	struct xfs_inode	*ip, *n;
	unsigned int		nofs_flag;

	WRITE_ONCE(gc->items, 0);

	if (!node)
		return;

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim.  To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	ip = llist_entry(node, struct xfs_inode, i_gclist);
	trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));

	WRITE_ONCE(gc->shrinker_hits, 0);
	llist_for_each_entry_safe(ip, n, node, i_gclist) {
		xfs_iflags_set(ip, XFS_INACTIVATING);
		xfs_inodegc_inactivate(ip);
	}

	memalloc_nofs_restore(nofs_flag);
}
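
/*
 * Editor's illustration (not part of the original file, excluded from the
 * build): the worker above combines two generic kernel patterns, namely
 * draining a lock-free llist in a single atomic operation and scoping
 * allocations with memalloc_nofs_save() while doing filesystem work on
 * behalf of reclaim.  A minimal sketch of that consumer side follows; the
 * example_* names are hypothetical and would need <linux/llist.h>,
 * <linux/sched/mm.h> and <linux/slab.h>.
 */
#if 0	/* illustrative sketch only */
struct example_item {
	struct llist_node	lnode;
};

static void
example_drain(
	struct llist_head	*queue)
{
	/* Detach the whole pending list in one atomic exchange. */
	struct llist_node	*first = llist_del_all(queue);
	struct example_item	*item, *next;
	unsigned int		nofs_flag;

	/* Forbid fs-reentrant allocations while processing the backlog. */
	nofs_flag = memalloc_nofs_save();

	llist_for_each_entry_safe(item, next, first, lnode)
		kfree(item);

	memalloc_nofs_restore(nofs_flag);
}
#endif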

/*
 * Force all currently queued inode inactivation work to run immediately and
 * wait for the work to finish.
 */
void
xfs_inodegc_flush(
	struct xfs_mount	*mp)
{
	if (!xfs_is_inodegc_enabled(mp))
		return;

	trace_xfs_inodegc_flush(mp, __return_address);

	xfs_inodegc_queue_all(mp);
	flush_workqueue(mp->m_inodegc_wq);
}

/*
 * Flush all the pending work and then disable the inode inactivation background
 * workers and wait for them to stop.
 */
void
xfs_inodegc_stop(
	struct xfs_mount	*mp)
{
	if (!xfs_clear_inodegc_enabled(mp))
		return;

	xfs_inodegc_queue_all(mp);
	drain_workqueue(mp->m_inodegc_wq);

	trace_xfs_inodegc_stop(mp, __return_address);
}

/*
 * Enable the inode inactivation background workers and schedule deferred inode
 * inactivation work if there is any.
 */
void
xfs_inodegc_start(
	struct xfs_mount	*mp)
{
	if (xfs_set_inodegc_enabled(mp))
		return;

	trace_xfs_inodegc_start(mp, __return_address);
	xfs_inodegc_queue_all(mp);
}
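
/*
 * Editor's illustration (not part of the original file, excluded from the
 * build): xfs_inodegc_stop() and xfs_inodegc_start() are meant to bracket
 * periods in which no deferred inactivation may run, and stop already
 * drains the queued work before returning.  A hypothetical caller could
 * pair them like this; xfs_example_quiesce() is a made-up name used purely
 * for illustration.
 */
#if 0	/* illustrative sketch only */
static void
xfs_example_quiesce(
	struct xfs_mount	*mp)
{
	/* Drain queued inactivations and disable the background workers. */
	xfs_inodegc_stop(mp);

	/* ... work that requires a quiet inode cache goes here ... */

	/* Re-enable deferred inactivation and kick any new backlog. */
	xfs_inodegc_start(mp);
}
#endif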

#ifdef CONFIG_XFS_RT
static inline bool
xfs_inodegc_want_queue_rt_file(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	uint64_t		freertx;

	if (!XFS_IS_REALTIME_INODE(ip))
		return false;

	freertx = READ_ONCE(mp->m_sb.sb_frextents);
	return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT];
}
#else
# define xfs_inodegc_want_queue_rt_file(ip)	(false)
#endif /* CONFIG_XFS_RT */

/*
 * Schedule the inactivation worker when:
 *
 *  - We've accumulated more than one inode cluster buffer's worth of inodes.
 *  - There is less than 5% free space left.
 *  - Any of the quotas for this inode are near an enforcement limit.
 */
static inline bool
xfs_inodegc_want_queue_work(
	struct xfs_inode	*ip,
	unsigned int		items)
{
	struct xfs_mount	*mp = ip->i_mount;

	if (items > mp->m_ino_geo.inodes_per_cluster)
		return true;

	if (__percpu_counter_compare(&mp->m_fdblocks,
				mp->m_low_space[XFS_LOWSP_5_PCNT],
				XFS_FDBLOCKS_BATCH) < 0)
		return true;

	if (xfs_inodegc_want_queue_rt_file(ip))
		return true;

	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER))
		return true;

	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP))
		return true;

	if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ))
		return true;

	return false;
}
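
/*
 * Editor's illustration (not part of the original file, excluded from the
 * build): the free-space test above relies on __percpu_counter_compare(),
 * which only sums the expensive per-cpu deltas when the approximate count
 * is within batch * num_online_cpus() of the comparison value.  A minimal
 * sketch of that pattern, using hypothetical example_* names:
 */
#if 0	/* illustrative sketch only */
/* Return true if @counter has dropped below @threshold. */
static bool
example_below_threshold(
	struct percpu_counter	*counter,
	s64			threshold)
{
	/*
	 * With a batch of 32, the cheap approximate comparison is trusted
	 * unless the counter is within 32 * num_online_cpus() of
	 * @threshold, in which case a precise sum is taken instead.
	 */
	return __percpu_counter_compare(counter, threshold, 32) < 0;
}
#endif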

/*
 * Upper bound on the number of inodes in each AG that can be queued for
 * inactivation at any given time, to avoid monopolizing the workqueue.
 */
#define XFS_INODEGC_MAX_BACKLOG		(4 * XFS_INODES_PER_CHUNK)

/*
 * Make the frontend wait for inactivations when:
 *
 *  - Memory shrinkers queued the inactivation worker and it hasn't finished.
 *  - The queue depth exceeds the maximum allowable percpu backlog.
 *
 * Note: If the current thread is running a transaction, we don't ever want to
 * wait for other transactions because that could introduce a deadlock.
 */
static inline bool
xfs_inodegc_want_flush_work(
	struct xfs_inode	*ip,
	unsigned int		items,
	unsigned int		shrinker_hits)
{
	if (current->journal_info)
		return false;

	if (shrinker_hits > 0)
		return true;

	if (items > XFS_INODEGC_MAX_BACKLOG)
		return true;

	return false;
}

/*
 * Queue a background inactivation worker if there are inodes that need to be
 * inactivated and higher level xfs code hasn't disabled the background
 * workers.
 */
static void
xfs_inodegc_queue(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_inodegc	*gc;
	int			items;
	unsigned int		shrinker_hits;

	trace_xfs_inode_set_need_inactive(ip);
	spin_lock(&ip->i_flags_lock);
	ip->i_flags |= XFS_NEED_INACTIVE;
	spin_unlock(&ip->i_flags_lock);

	gc = get_cpu_ptr(mp->m_inodegc);
	llist_add(&ip->i_gclist, &gc->list);
	items = READ_ONCE(gc->items);
	WRITE_ONCE(gc->items, items + 1);
	shrinker_hits = READ_ONCE(gc->shrinker_hits);
	put_cpu_ptr(gc);

	if (!xfs_is_inodegc_enabled(mp))
		return;

	if (xfs_inodegc_want_queue_work(ip, items)) {
		trace_xfs_inodegc_queue(mp, __return_address);
		queue_work(mp->m_inodegc_wq, &gc->work);
	}

	if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
		trace_xfs_inodegc_throttle(mp, __return_address);
		flush_work(&gc->work);
	}
}
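
/*
 * Editor's illustration (not part of the original file, excluded from the
 * build): the queueing above is an instance of the per-cpu, lock-free
 * deferral pattern, in which the producer pushes onto the local CPU's
 * llist with preemption disabled, keeps an advisory item count with
 * READ_ONCE/WRITE_ONCE, and kicks a workqueue item once a threshold is
 * crossed.  The example_* names and the threshold of 32 are hypothetical,
 * and q->work is assumed to have been set up with INIT_WORK() elsewhere.
 */
#if 0	/* illustrative sketch only */
struct example_pcpu_queue {
	struct llist_head	list;
	struct work_struct	work;
	unsigned int		items;
};

static void
example_defer(
	struct example_pcpu_queue __percpu *queues,
	struct workqueue_struct	*wq,
	struct llist_node	*lnode)
{
	struct example_pcpu_queue *q;
	unsigned int		items;

	/* Pin this CPU so the item lands on the local queue. */
	q = get_cpu_ptr(queues);
	llist_add(lnode, &q->list);

	/* Advisory only; the worker resets it when it drains the list. */
	items = READ_ONCE(q->items);
	WRITE_ONCE(q->items, items + 1);
	put_cpu_ptr(queues);

	if (items + 1 >= 32)
		queue_work(wq, &q->work);
}
#endif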

/*
 * Fold the dead CPU's inodegc queue into the current CPU's queue.
 */
void
xfs_inodegc_cpu_dead(
	struct xfs_mount	*mp,
	unsigned int		dead_cpu)
{
	struct xfs_inodegc	*dead_gc, *gc;
	struct llist_node	*first, *last;
	unsigned int		count = 0;

	dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
	cancel_work_sync(&dead_gc->work);

	if (llist_empty(&dead_gc->list))
		return;

	first = dead_gc->list.first;
	last = first;
	while (last->next) {
		last = last->next;
		count++;
	}
	dead_gc->list.first = NULL;
	dead_gc->items = 0;

	/* Add pending work to current CPU */
	gc = get_cpu_ptr(mp->m_inodegc);
	llist_add_batch(first, last, &gc->list);
	count += READ_ONCE(gc->items);
	WRITE_ONCE(gc->items, count);
	put_cpu_ptr(gc);

	if (xfs_is_inodegc_enabled(mp)) {
		trace_xfs_inodegc_queue(mp, __return_address);
		queue_work(mp->m_inodegc_wq, &gc->work);
	}
}
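
/*
 * Editor's illustration (not part of the original file, excluded from the
 * build): splicing one llist into another, as done above for the dead
 * CPU's queue, requires walking to the tail once because an llist_head
 * only tracks the first node.  A generic sketch with hypothetical names:
 */
#if 0	/* illustrative sketch only */
static void
example_splice_llist(
	struct llist_head	*src,
	struct llist_head	*dst)
{
	struct llist_node	*first = llist_del_all(src);
	struct llist_node	*last = first;

	if (!first)
		return;

	/* Find the tail, then insert the whole chain in one atomic step. */
	while (last->next)
		last = last->next;
	llist_add_batch(first, last, dst);
}
#endif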

/*
 * We set the inode flag atomically with the radix tree tag.  Once we get tag
 * lookups on the radix tree, this inode flag can go away.
 *
 * We always use background reclaim here because even if the inode is clean, it
 * still may be under IO and hence we have to wait for IO completion to occur
 * before we can reclaim the inode. The background reclaim path handles this
 * more efficiently than we can here, so simply let background reclaim tear down
 * all inodes.
 */
void
xfs_inode_mark_reclaimable(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	bool			need_inactive;

	XFS_STATS_INC(mp, vn_reclaim);

	/*
	 * We should never get here with any of the reclaim flags already set.
	 */
	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));

	need_inactive = xfs_inode_needs_inactive(ip);
	if (need_inactive) {
		xfs_inodegc_queue(ip);
		return;
	}

	/* Going straight to reclaim, so drop the dquots. */
	xfs_qm_dqdetach(ip);
	xfs_inodegc_set_reclaimable(ip);
}

/*
 * Register a phony shrinker so that we can run background inodegc sooner when
 * there's memory pressure.  Inactivation does not itself free any memory but
 * it does make inodes reclaimable, which eventually frees memory.
 *
 * The count function, seek value, and batch value are crafted to trigger the
 * scan function during the second round of scanning.  Hopefully this means
 * that we reclaimed enough memory that initiating metadata transactions won't
 * make things worse.
 */
#define XFS_INODEGC_SHRINKER_COUNT	(1UL << DEF_PRIORITY)
#define XFS_INODEGC_SHRINKER_BATCH	((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)
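
/*
 * Editor's note (worked example, assuming DEF_PRIORITY is 12 and that the
 * shrinker core asks a seeks==0 shrinker to scan roughly freeable/2 objects
 * per reclaim pass): count_objects() reports 1 << 12 = 4096, so each pass
 * contributes about 2048 towards the scan target while the batch size is
 * 2049.  The first pass therefore defers its work, and the accumulated
 * total of roughly 4096 on the second pass crosses the batch threshold and
 * finally invokes scan_objects(), which is the "second round of scanning"
 * described above.
 */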

static unsigned long
xfs_inodegc_shrinker_count(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_mount	*mp = container_of(shrink, struct xfs_mount,
						   m_inodegc_shrinker);
	struct xfs_inodegc	*gc;
	int			cpu;

	if (!xfs_is_inodegc_enabled(mp))
		return 0;

	for_each_online_cpu(cpu) {
		gc = per_cpu_ptr(mp->m_inodegc, cpu);
		if (!llist_empty(&gc->list))
			return XFS_INODEGC_SHRINKER_COUNT;
	}

	return 0;
}

static unsigned long
xfs_inodegc_shrinker_scan(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_mount	*mp = container_of(shrink, struct xfs_mount,
						   m_inodegc_shrinker);
	struct xfs_inodegc	*gc;
	int			cpu;
	bool			no_items = true;

	if (!xfs_is_inodegc_enabled(mp))
		return SHRINK_STOP;

	trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);

	for_each_online_cpu(cpu) {
		gc = per_cpu_ptr(mp->m_inodegc, cpu);
		if (!llist_empty(&gc->list)) {
			unsigned int	h = READ_ONCE(gc->shrinker_hits);

			WRITE_ONCE(gc->shrinker_hits, h + 1);
			queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
			no_items = false;
		}
	}

	/*
	 * If there are no inodes to inactivate, we don't want the shrinker
	 * to think there's deferred work to call us back about.
	 */
	if (no_items)
		return LONG_MAX;

	return SHRINK_STOP;
}

/* Register a shrinker so we can accelerate inodegc and throttle queuing. */
int
xfs_inodegc_register_shrinker(
	struct xfs_mount	*mp)
{
	struct shrinker		*shrink = &mp->m_inodegc_shrinker;

	shrink->count_objects = xfs_inodegc_shrinker_count;
	shrink->scan_objects = xfs_inodegc_shrinker_scan;
	shrink->seeks = 0;
	shrink->flags = SHRINKER_NONSLAB;
	shrink->batch = XFS_INODEGC_SHRINKER_BATCH;

	return register_shrinker(shrink);
}