/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
#include "xfs_dinode.h"
#include "xfs_error.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_fsops.h"
#include "xfs_icache.h"

#include <linux/kthread.h>
#include <linux/freezer.h>

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum. The batch size is a trade off between
 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH	32

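/*
 * Grab a reference to an inode found during an AG walk. Returns 0 if the
 * inode was grabbed, ENOENT if it should be skipped (freed, new, reclaimable
 * or bad), or EFSCORRUPTED if the filesystem has been shut down.
 */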
STATIC int
xfs_inode_ag_walk_grab(
	struct xfs_inode	*ip)
{
	struct inode		*inode = VFS_I(ip);

	ASSERT(rcu_read_lock_held());

	/*
	 * check for stale RCU freed inode
	 *
	 * If the inode has been reallocated, it doesn't matter if it's not in
	 * the AG we are walking - we are walking for writeback, so if it
	 * passes all the "valid inode" checks and is dirty, then we'll write
	 * it back anyway.  If it has been reallocated and still being
	 * initialised, the XFS_INEW check below will catch it.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return EFSCORRUPTED;

	/* If we can't grab the inode, it must be on its way to reclaim. */
	if (!igrab(inode))
		return ENOENT;

	if (is_bad_inode(inode)) {
		IRELE(ip);
		return ENOENT;
	}

	/* inode is valid */
	return 0;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return ENOENT;
}

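/*
 * Walk the in-memory inodes of a single AG in XFS_LOOKUP_BATCH sized chunks,
 * calling @execute on each inode we can grab. Inodes whose callback returns
 * EAGAIN are counted as skipped and the whole walk is restarted after a short
 * delay once the current pass completes.
 */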
STATIC int
xfs_inode_ag_walk(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	int			(*execute)(struct xfs_inode *ip,
					   struct xfs_perag *pag, int flags),
	int			flags)
{
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;
	int			done;
	int			nr_found;

restart:
	done = 0;
	skipped = 0;
	first_index = 0;
	nr_found = 0;
	do {
		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
		int		error = 0;
		int		i;

		rcu_read_lock();
		nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH);
		if (!nr_found) {
			rcu_read_unlock();
			break;
		}

		/*
		 * Grab the inodes before we drop the lock. If we found
		 * nothing, nr == 0 and the loop will be skipped.
		 */
		for (i = 0; i < nr_found; i++) {
			struct xfs_inode *ip = batch[i];

			if (done || xfs_inode_ag_walk_grab(ip))
				batch[i] = NULL;

			/*
			 * Update the index for the next lookup. Catch
			 * overflows into the next AG range which can occur if
			 * we have inodes in the last block of the AG and we
			 * are currently pointing to the last inode.
			 *
			 * Because we may see inodes that are from the wrong AG
			 * due to RCU freeing and reallocation, only update the
			 * index if it lies in this AG. It was a race that led
			 * us to see this inode, so another lookup from the
			 * same index will not find it again.
			 */
			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
				continue;
			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
				done = 1;
		}

		/* unlock now we've grabbed the inodes. */
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
			error = execute(batch[i], pag, flags);
			IRELE(batch[i]);
			if (error == EAGAIN) {
				skipped++;
				continue;
			}
			if (error && last_error != EFSCORRUPTED)
				last_error = error;
		}

		/* bail out if the filesystem is corrupted.  */
		if (error == EFSCORRUPTED)
			break;

		cond_resched();

	} while (nr_found && !done);

	if (skipped) {
		delay(1);
		goto restart;
	}
	return last_error;
}

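/*
 * Apply @execute to every grabbable in-memory inode in the filesystem, one AG
 * at a time. The first EFSCORRUPTED error aborts the iteration; any other
 * error is remembered and returned once the walk completes.
 */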
int
xfs_inode_ag_iterator(
	struct xfs_mount	*mp,
	int			(*execute)(struct xfs_inode *ip,
					   struct xfs_perag *pag, int flags),
	int			flags)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;

	ag = 0;
	while ((pag = xfs_perag_get(mp, ag))) {
		ag = pag->pag_agno + 1;
		error = xfs_inode_ag_walk(mp, pag, execute, flags);
		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			if (error == EFSCORRUPTED)
				break;
		}
	}
	return XFS_ERROR(last_error);
}

/*
 * Queue a new inode reclaim pass if there are reclaimable inodes and there
 * isn't a reclaim pass already in progress. By default it runs every 5s based
 * on the xfs periodic sync default of 30s. Perhaps this should have its own
 * tunable, but that can be done if this method proves to be ineffective or too
 * aggressive.
 */
static void
xfs_reclaim_work_queue(
	struct xfs_mount        *mp)
{

	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
	}
	rcu_read_unlock();
}

/*
 * This is a fast pass over the inode cache to try to get reclaim moving on as
 * many inodes as possible in a short period of time. It kicks itself every few
 * seconds, as well as being kicked by the inode cache shrinker when memory
 * goes low. It scans as quickly as possible avoiding locked inodes or those
 * already being flushed, and once done schedules a future pass.
 */
void
xfs_reclaim_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_reclaim_work);

	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
	xfs_reclaim_work_queue(mp);
}

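/*
 * Mark an inode as reclaimable in the per-AG inode radix tree. If it is the
 * first reclaimable inode in the AG, also propagate the tag up into the
 * per-mount perag radix tree and kick the background reclaim worker.
 */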
void
__xfs_inode_set_reclaim_tag(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip)
{
	radix_tree_tag_set(&pag->pag_ici_root,
			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
			   XFS_ICI_RECLAIM_TAG);

	if (!pag->pag_ici_reclaimable) {
		/* propagate the reclaim tag up into the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				XFS_ICI_RECLAIM_TAG);
		spin_unlock(&ip->i_mount->m_perag_lock);

		/* schedule periodic background inode reclaim */
		xfs_reclaim_work_queue(ip->i_mount);

		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
							-1, _RET_IP_);
	}
	pag->pag_ici_reclaimable++;
}

/*
 * We set the inode flag atomically with the radix tree tag.
 * Once we get tag lookups on the radix tree, this inode flag
 * can go away.
 */
void
xfs_inode_set_reclaim_tag(
	xfs_inode_t	*ip)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_perag *pag;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);
	__xfs_inode_set_reclaim_tag(pag, ip);
	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

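/*
 * Account for an inode leaving the reclaimable state. If it was the last
 * reclaimable inode in the AG, clear the reclaim tag for this AG from the
 * per-mount perag radix tree.
 */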
STATIC void
__xfs_inode_clear_reclaim(
	xfs_perag_t	*pag,
	xfs_inode_t	*ip)
{
	pag->pag_ici_reclaimable--;
	if (!pag->pag_ici_reclaimable) {
		/* clear the reclaim tag from the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				XFS_ICI_RECLAIM_TAG);
		spin_unlock(&ip->i_mount->m_perag_lock);
		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
							-1, _RET_IP_);
	}
}

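/*
 * Clear the reclaim tag for an inode in the per-AG inode radix tree and
 * update the per-AG reclaimable inode accounting.
 */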
void
__xfs_inode_clear_reclaim_tag(
	xfs_mount_t	*mp,
	xfs_perag_t	*pag,
	xfs_inode_t	*ip)
{
	radix_tree_tag_clear(&pag->pag_ici_root,
			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
	__xfs_inode_clear_reclaim(pag, ip);
}

/*
 * Grab the inode for reclaim exclusively.
 * Return 0 if we grabbed it, non-zero otherwise.
 */
STATIC int
xfs_reclaim_inode_grab(
	struct xfs_inode	*ip,
	int			flags)
{
	ASSERT(rcu_read_lock_held());

	/* quick check for stale RCU freed inode */
	if (!ip->i_ino)
		return 1;

	/*
	 * If we are asked for non-blocking operation, do unlocked checks to
	 * see if the inode already is being flushed or in reclaim to avoid
	 * lock traffic.
	 */
	if ((flags & SYNC_TRYLOCK) &&
	    __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
		return 1;

	/*
	 * The radix tree lock here protects a thread in xfs_iget from racing
	 * with us starting reclaim on the inode.  Once we have the
	 * XFS_IRECLAIM flag set it will not touch us.
	 *
	 * Due to RCU lookup, we may find inodes that have been freed and only
	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
	 * aren't candidates for reclaim at all, so we must check that
	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* not a reclaim candidate. */
		spin_unlock(&ip->i_flags_lock);
		return 1;
	}
	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return 0;
}

/*
 * Inodes in different states need to be treated differently. The following
 * table lists the inode states and the reclaim actions necessary:
 *
 *	inode state	     iflush ret		required action
 *      ---------------      ----------         ---------------
 *	bad			-		reclaim
 *	shutdown		EIO		unpin and reclaim
 *	clean, unpinned		0		reclaim
 *	stale, unpinned		0		reclaim
 *	clean, pinned(*)	0		requeue
 *	stale, pinned		EAGAIN		requeue
 *	dirty, async		-		requeue
 *	dirty, sync		0		reclaim
 *
 * (*) dgc: I don't think the clean, pinned state is possible but it gets
 * handled anyway given the order of checks implemented.
 *
 * Also, because we get the flush lock first, we know that any inode that has
 * been flushed delwri has had the flush completed by the time we check that
 * the inode is clean.
 *
 * Note that because the inode is flushed delayed write by AIL pushing, the
 * flush lock may already be held here and waiting on it can result in very
 * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
 * the caller should push the AIL first before trying to reclaim inodes to
 * minimise the amount of time spent waiting.  For background reclaim, we only
 * bother to reclaim clean inodes anyway.
 *
 * Hence the order of actions after gaining the locks should be:
 *	bad		=> reclaim
 *	shutdown	=> unpin and reclaim
 *	pinned, async	=> requeue
 *	pinned, sync	=> unpin
 *	stale		=> reclaim
 *	clean		=> reclaim
 *	dirty, async	=> requeue
 *	dirty, sync	=> flush, wait and reclaim
 */
STATIC int
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			sync_mode)
{
	struct xfs_buf		*bp = NULL;
	int			error;

restart:
	error = 0;
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (!xfs_iflock_nowait(ip)) {
		if (!(sync_mode & SYNC_WAIT))
			goto out;
		xfs_iflock(ip);
	}

	if (is_bad_inode(VFS_I(ip)))
		goto reclaim;
	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_iunpin_wait(ip);
		xfs_iflush_abort(ip, false);
		goto reclaim;
	}
	if (xfs_ipincount(ip)) {
		if (!(sync_mode & SYNC_WAIT))
			goto out_ifunlock;
		xfs_iunpin_wait(ip);
	}
	if (xfs_iflags_test(ip, XFS_ISTALE))
		goto reclaim;
	if (xfs_inode_clean(ip))
		goto reclaim;

	/*
	 * Never flush out dirty data during non-blocking reclaim, as it would
	 * just contend with AIL pushing trying to do the same job.
	 */
	if (!(sync_mode & SYNC_WAIT))
		goto out_ifunlock;

	/*
	 * Now we have an inode that needs flushing.
	 *
	 * Note that xfs_iflush will never block on the inode buffer lock, as
	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
	 * ip->i_lock, and we are doing the exact opposite here.  As a result,
	 * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
	 * result in an ABBA deadlock with xfs_ifree_cluster().
	 *
	 * As xfs_ifree_cluster() must gather all inodes that are active in the
	 * cache to mark them stale, if we hit this case we don't actually want
	 * to do IO here - we want the inode marked stale so we can simply
	 * reclaim it.  Hence if we get an EAGAIN error here, just unlock the
	 * inode, back off and try again.  Hopefully the next pass through will
	 * see the stale flag set on the inode.
	 */
	error = xfs_iflush(ip, &bp);
	if (error == EAGAIN) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		/* backoff longer than in xfs_ifree_cluster */
		delay(2);
		goto restart;
	}

	if (!error) {
		error = xfs_bwrite(bp);
		xfs_buf_relse(bp);
	}

	xfs_iflock(ip);
reclaim:
	xfs_ifunlock(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree assert that it's been there before to catch
	 * problems with the inode life time early on.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
		ASSERT(0);
	__xfs_inode_clear_reclaim(pag, ip);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups.  This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_qm_dqdetach(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	xfs_inode_free(ip);
	return error;

out_ifunlock:
	xfs_ifunlock(ip);
out:
	xfs_iflags_clear(ip, XFS_IRECLAIM);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	/*
	 * We could return EAGAIN here to make reclaim rescan the inode tree in
	 * a short while. However, this just burns CPU time scanning the tree
	 * waiting for IO to complete and the reclaim work never goes back to
	 * the idle state. Instead, return 0 to let the next scheduled
	 * background reclaim attempt to reclaim the inode again.
	 */
	return 0;
}

/*
 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
 * corrupted, we still want to try to reclaim all the inodes. If we don't,
 * then a shutdown during a filesystem unmount reclaim walk will leak all the
 * unreclaimed inodes.
 */
int
xfs_reclaim_inodes_ag(
	struct xfs_mount	*mp,
	int			flags,
	int			*nr_to_scan)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;
	int			trylock = flags & SYNC_TRYLOCK;
	int			skipped;

restart:
	ag = 0;
	skipped = 0;
	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		unsigned long	first_index = 0;
		int		done = 0;
		int		nr_found = 0;

		ag = pag->pag_agno + 1;

		if (trylock) {
			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
				skipped++;
				xfs_perag_put(pag);
				continue;
			}
			first_index = pag->pag_ici_reclaim_cursor;
		} else
			mutex_lock(&pag->pag_ici_reclaim_lock);

		do {
			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
			int	i;

			rcu_read_lock();
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH,
					XFS_ICI_RECLAIM_TAG);
			if (!nr_found) {
				done = 1;
				rcu_read_unlock();
				break;
			}

			/*
			 * Grab the inodes before we drop the lock. If we found
			 * nothing, nr == 0 and the loop will be skipped.
			 */
			for (i = 0; i < nr_found; i++) {
				struct xfs_inode *ip = batch[i];

				if (done || xfs_reclaim_inode_grab(ip, flags))
					batch[i] = NULL;

				/*
				 * Update the index for the next lookup. Catch
				 * overflows into the next AG range which can
				 * occur if we have inodes in the last block of
				 * the AG and we are currently pointing to the
				 * last inode.
				 *
				 * Because we may see inodes that are from the
				 * wrong AG due to RCU freeing and
				 * reallocation, only update the index if it
				 * lies in this AG. It was a race that led us
				 * to see this inode, so another lookup from
				 * the same index will not find it again.
				 */
				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
								pag->pag_agno)
					continue;
				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
					done = 1;
			}

			/* unlock now we've grabbed the inodes. */
			rcu_read_unlock();

			for (i = 0; i < nr_found; i++) {
				if (!batch[i])
					continue;
				error = xfs_reclaim_inode(batch[i], pag, flags);
				if (error && last_error != EFSCORRUPTED)
					last_error = error;
			}

			*nr_to_scan -= XFS_LOOKUP_BATCH;

			cond_resched();

		} while (nr_found && !done && *nr_to_scan > 0);

		if (trylock && !done)
			pag->pag_ici_reclaim_cursor = first_index;
		else
			pag->pag_ici_reclaim_cursor = 0;
		mutex_unlock(&pag->pag_ici_reclaim_lock);
		xfs_perag_put(pag);
	}

	/*
	 * If we skipped any AG, and we still have scan count remaining, do
	 * another pass this time using blocking reclaim semantics (i.e.
	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
	 * ensures that when we get more reclaimers than AGs we block rather
	 * than spin trying to execute reclaim.
	 */
	if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
		trylock = 0;
		goto restart;
	}
	return XFS_ERROR(last_error);
}

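/*
 * Attempt to reclaim all reclaimable inodes in the filesystem, in the mode
 * requested by the caller (e.g. SYNC_TRYLOCK, SYNC_WAIT).
 */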
int
xfs_reclaim_inodes(
	xfs_mount_t	*mp,
	int		mode)
{
	int		nr_to_scan = INT_MAX;

	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
}

/*
 * Scan a certain number of inodes for reclaim.
 *
 * When called we make sure that there is a background (fast) inode reclaim in
 * progress, while we will throttle the speed of reclaim via doing synchronous
 * reclaim of inodes. That means if we come across dirty inodes, we wait for
 * them to be cleaned, which we hope will not be very long due to the
 * background walker having already kicked the IO off on those dirty inodes.
 */
void
xfs_reclaim_inodes_nr(
	struct xfs_mount	*mp,
	int			nr_to_scan)
{
	/* kick background reclaimer and push the AIL */
	xfs_reclaim_work_queue(mp);
	xfs_ail_push_all(mp->m_ail);

	xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
}

/*
 * Return the number of reclaimable inodes in the filesystem for
 * the shrinker to determine how much to reclaim.
 */
int
xfs_reclaim_inodes_count(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		ag = 0;
	int			reclaimable = 0;

	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		ag = pag->pag_agno + 1;
		reclaimable += pag->pag_ici_reclaimable;
		xfs_perag_put(pag);
	}
	return reclaimable;
}