xfs_sync.c 23.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
#include "xfs_dinode.h"
#include "xfs_error.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_inode_item.h"
C
Christoph Hellwig 已提交
35
#include "xfs_quota.h"
C
Christoph Hellwig 已提交
36
#include "xfs_trace.h"
37
#include "xfs_fsops.h"
38

39 40 41
#include <linux/kthread.h>
#include <linux/freezer.h>

42

43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
/*
 * Find the next cached inode in @pag whose radix tree index is at or after
 * *first_index.  When @tag is XFS_ICI_NO_TAG every cached inode is a
 * candidate, otherwise only entries carrying @tag are considered.
 *
 * On success *first_index is moved one past the returned inode so the next
 * call continues the walk.  Returns NULL when no further inode exists or
 * when advancing the index wrapped around.
 *
 * xfs_inode_ag_walk() calls this with pag->pag_ici_lock held.
 */
STATIC xfs_inode_t *
xfs_inode_ag_lookup(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	uint32_t		*first_index,
	int			tag)
{
	struct xfs_inode	*ip;
	int			found;

	/*
	 * The tree is sparse, so ask for a single entry with a gang lookup:
	 * it walks forward from the start index until it has found the
	 * number of objects requested.
	 */
	if (tag != XFS_ICI_NO_TAG)
		found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
				(void **)&ip, *first_index, 1, tag);
	else
		found = radix_tree_gang_lookup(&pag->pag_ici_root,
				(void **)&ip, *first_index, 1);
	if (found == 0)
		return NULL;

	/*
	 * Step the cursor past this inode.  If the inode was the last one in
	 * the last block of the AG, the incremented value overflows into the
	 * next AG's range and the resulting AG-relative index is smaller than
	 * the current one; treat that as the end of the walk.
	 */
	*first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
	return (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) ? NULL : ip;
}

STATIC int
xfs_inode_ag_walk(
	struct xfs_mount	*mp,
D
Dave Chinner 已提交
83
	struct xfs_perag	*pag,
84 85 86
	int			(*execute)(struct xfs_inode *ip,
					   struct xfs_perag *pag, int flags),
	int			flags,
87
	int			tag,
88 89
	int			exclusive,
	int			*nr_to_scan)
90 91 92 93 94 95 96 97 98 99 100 101
{
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;

restart:
	skipped = 0;
	first_index = 0;
	do {
		int		error = 0;
		xfs_inode_t	*ip;

102 103 104 105
		if (exclusive)
			write_lock(&pag->pag_ici_lock);
		else
			read_lock(&pag->pag_ici_lock);
106
		ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag);
107 108 109 110 111
		if (!ip) {
			if (exclusive)
				write_unlock(&pag->pag_ici_lock);
			else
				read_unlock(&pag->pag_ici_lock);
112
			break;
113
		}
114

115
		/* execute releases pag->pag_ici_lock */
116 117 118 119 120 121 122
		error = execute(ip, pag, flags);
		if (error == EAGAIN) {
			skipped++;
			continue;
		}
		if (error)
			last_error = error;
123 124

		/* bail out if the filesystem is corrupted.  */
125 126 127
		if (error == EFSCORRUPTED)
			break;

128
	} while ((*nr_to_scan)--);
129 130 131 132 133 134 135 136

	if (skipped) {
		delay(1);
		goto restart;
	}
	return last_error;
}

137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
/*
 * Pick the per-AG structure the iterator should visit next, starting the
 * search at AG number *first and advancing *first past the AG returned.
 *
 * Reclaim walks (XFS_ICI_RECLAIM_TAG) are optimised: only AGs tagged as
 * holding reclaimable inodes in the perag radix tree are returned.  All other
 * walks simply take the AGs in numerical order via xfs_perag_get().
 *
 * The returned perag carries a reference the caller must drop with
 * xfs_perag_put().  Returns NULL when there is no further AG to visit.
 */
static struct xfs_perag *
xfs_inode_ag_iter_next_pag(
	struct xfs_mount	*mp,
	xfs_agnumber_t		*first,
	int			tag)
{
	struct xfs_perag	*pag = NULL;
	int			nr_found;
	int			refcount;

	if (tag != XFS_ICI_RECLAIM_TAG) {
		pag = xfs_perag_get(mp, *first);
		(*first)++;
		return pag;
	}

	spin_lock(&mp->m_perag_lock);
	nr_found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
			(void **)&pag, *first, 1, tag);
	if (nr_found <= 0) {
		spin_unlock(&mp->m_perag_lock);
		return NULL;
	}
	*first = pag->pag_agno + 1;

	/* open coded pag reference increment */
	refcount = atomic_inc_return(&pag->pag_ref);
	spin_unlock(&mp->m_perag_lock);
	trace_xfs_perag_get_reclaim(mp, pag->pag_agno, refcount, _RET_IP_);
	return pag;
}

172
int
173 174 175 176 177
xfs_inode_ag_iterator(
	struct xfs_mount	*mp,
	int			(*execute)(struct xfs_inode *ip,
					   struct xfs_perag *pag, int flags),
	int			flags,
178
	int			tag,
179 180
	int			exclusive,
	int			*nr_to_scan)
181
{
182
	struct xfs_perag	*pag;
183 184 185
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;
186
	int			nr;
187

188
	nr = nr_to_scan ? *nr_to_scan : INT_MAX;
189 190
	ag = 0;
	while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) {
D
Dave Chinner 已提交
191
		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
192
						exclusive, &nr);
D
Dave Chinner 已提交
193
		xfs_perag_put(pag);
194 195 196 197 198
		if (error) {
			last_error = error;
			if (error == EFSCORRUPTED)
				break;
		}
199 200
		if (nr <= 0)
			break;
201
	}
202 203
	if (nr_to_scan)
		*nr_to_scan = nr;
204 205 206
	return XFS_ERROR(last_error);
}

207
/* must be called with pag_ici_lock held and releases it */
208
int
209 210 211 212 213
xfs_sync_inode_valid(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag)
{
	struct inode		*inode = VFS_I(ip);
214
	int			error = EFSCORRUPTED;
215 216

	/* nothing to sync during shutdown */
217 218
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		goto out_unlock;
219

220 221 222 223
	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
	error = ENOENT;
	if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
		goto out_unlock;
224

225 226 227 228 229
	/* If we can't grab the inode, it must on it's way to reclaim. */
	if (!igrab(inode))
		goto out_unlock;

	if (is_bad_inode(inode)) {
230
		IRELE(ip);
231
		goto out_unlock;
232 233
	}

234 235 236 237 238
	/* inode is valid */
	error = 0;
out_unlock:
	read_unlock(&pag->pag_ici_lock);
	return error;
239 240
}

241 242 243
STATIC int
xfs_sync_inode_data(
	struct xfs_inode	*ip,
244
	struct xfs_perag	*pag,
245 246 247 248 249 250
	int			flags)
{
	struct inode		*inode = VFS_I(ip);
	struct address_space *mapping = inode->i_mapping;
	int			error = 0;

251 252 253 254
	error = xfs_sync_inode_valid(ip, pag);
	if (error)
		return error;

255 256 257 258 259 260 261 262 263 264
	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		goto out_wait;

	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
		if (flags & SYNC_TRYLOCK)
			goto out_wait;
		xfs_ilock(ip, XFS_IOLOCK_SHARED);
	}

	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
265
				0 : XBF_ASYNC, FI_NONE);
266 267 268
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

 out_wait:
C
Christoph Hellwig 已提交
269
	if (flags & SYNC_WAIT)
270
		xfs_ioend_wait(ip);
271
	IRELE(ip);
272 273 274
	return error;
}

275 276 277
STATIC int
xfs_sync_inode_attr(
	struct xfs_inode	*ip,
278
	struct xfs_perag	*pag,
279 280 281 282
	int			flags)
{
	int			error = 0;

283 284 285 286
	error = xfs_sync_inode_valid(ip, pag);
	if (error)
		return error;

287 288 289 290 291 292 293 294 295 296 297 298 299 300
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	if (xfs_inode_clean(ip))
		goto out_unlock;
	if (!xfs_iflock_nowait(ip)) {
		if (!(flags & SYNC_WAIT))
			goto out_unlock;
		xfs_iflock(ip);
	}

	if (xfs_inode_clean(ip)) {
		xfs_ifunlock(ip);
		goto out_unlock;
	}

301
	error = xfs_iflush(ip, flags);
302 303 304

 out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
305
	IRELE(ip);
306 307 308
	return error;
}

C
Christoph Hellwig 已提交
309 310 311
/*
 * Write out pagecache data for the whole filesystem.
 */
312
STATIC int
C
Christoph Hellwig 已提交
313 314 315
xfs_sync_data(
	struct xfs_mount	*mp,
	int			flags)
316
{
C
Christoph Hellwig 已提交
317
	int			error;
318

C
Christoph Hellwig 已提交
319
	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
320

C
Christoph Hellwig 已提交
321
	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags,
322
				      XFS_ICI_NO_TAG, 0, NULL);
C
Christoph Hellwig 已提交
323 324
	if (error)
		return XFS_ERROR(error);
325

326
	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
C
Christoph Hellwig 已提交
327 328
	return 0;
}
329

C
Christoph Hellwig 已提交
330 331 332
/*
 * Write out inode metadata (attributes) for the whole filesystem.
 */
333
STATIC int
C
Christoph Hellwig 已提交
334 335 336 337 338
xfs_sync_attr(
	struct xfs_mount	*mp,
	int			flags)
{
	ASSERT((flags & ~SYNC_WAIT) == 0);
339

C
Christoph Hellwig 已提交
340
	return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags,
341
				     XFS_ICI_NO_TAG, 0, NULL);
342 343
}

344
STATIC int
345
xfs_sync_fsdata(
346
	struct xfs_mount	*mp)
347 348 349 350
{
	struct xfs_buf		*bp;

	/*
351 352 353 354 355 356
	 * If the buffer is pinned then push on the log so we won't get stuck
	 * waiting in the write for someone, maybe ourselves, to flush the log.
	 *
	 * Even though we just pushed the log above, we did not have the
	 * superblock buffer locked at that point so it can become pinned in
	 * between there and here.
357
	 */
358 359 360
	bp = xfs_getsb(mp, 0);
	if (XFS_BUF_ISPINNED(bp))
		xfs_log_force(mp, 0);
361

362
	return xfs_bwrite(mp, bp);
363 364 365
}

/*
D
David Chinner 已提交
366 367 368 369 370 371 372 373 374 375 376
 * When remounting a filesystem read-only or freezing the filesystem, we have
 * two phases to execute. This first phase is syncing the data before we
 * quiesce the filesystem, and the second is flushing all the inodes out after
 * we've waited for all the transactions created by the first phase to
 * complete. The second phase ensures that the inodes are written to their
 * location on disk rather than just existing in transactions in the log. This
 * means after a quiesce there is no log replay required to write the inodes to
 * disk (this is the main difference between a sync and a quiesce).
 */
/*
 * First stage of freeze - no writers will make progress now we are here,
377 378
 * so we flush delwri and delalloc buffers here, then wait for all I/O to
 * complete.  Data is frozen at that point. Metadata is not frozen,
D
David Chinner 已提交
379 380
 * transactions can still occur here so don't bother flushing the buftarg
 * because it'll just get dirty again.
381 382 383 384 385
 */
int
xfs_quiesce_data(
	struct xfs_mount	*mp)
{
386
	int			error, error2 = 0;
387 388

	/* push non-blocking */
C
Christoph Hellwig 已提交
389
	xfs_sync_data(mp, 0);
C
Christoph Hellwig 已提交
390
	xfs_qm_sync(mp, SYNC_TRYLOCK);
391

D
Dave Chinner 已提交
392
	/* push and block till complete */
C
Christoph Hellwig 已提交
393
	xfs_sync_data(mp, SYNC_WAIT);
C
Christoph Hellwig 已提交
394
	xfs_qm_sync(mp, SYNC_WAIT);
395

D
David Chinner 已提交
396
	/* write superblock and hoover up shutdown errors */
397 398 399 400 401 402 403
	error = xfs_sync_fsdata(mp);

	/* make sure all delwri buffers are written out */
	xfs_flush_buftarg(mp->m_ddev_targp, 1);

	/* mark the log as covered if needed */
	if (xfs_log_need_covered(mp))
404
		error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
405

D
David Chinner 已提交
406
	/* flush data-only devices */
407 408 409
	if (mp->m_rtdev_targp)
		XFS_bflush(mp->m_rtdev_targp);

410
	return error ? error : error2;
411 412
}

D
David Chinner 已提交
413 414 415 416 417 418
STATIC void
xfs_quiesce_fs(
	struct xfs_mount	*mp)
{
	int	count = 0, pincount;

419
	xfs_reclaim_inodes(mp, 0);
D
David Chinner 已提交
420 421 422 423 424 425
	xfs_flush_buftarg(mp->m_ddev_targp, 0);

	/*
	 * This loop must run at least twice.  The first instance of the loop
	 * will flush most meta data but that will generate more meta data
	 * (typically directory updates).  Which then must be flushed and
426 427
	 * logged before we can write the unmount record. We also so sync
	 * reclaim of inodes to catch any that the above delwri flush skipped.
D
David Chinner 已提交
428 429
	 */
	do {
430
		xfs_reclaim_inodes(mp, SYNC_WAIT);
C
Christoph Hellwig 已提交
431
		xfs_sync_attr(mp, SYNC_WAIT);
D
David Chinner 已提交
432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457
		pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
		if (!pincount) {
			delay(50);
			count++;
		}
	} while (count < 2);
}

/*
 * Second stage of a quiesce. The data is already synced, now we have to take
 * care of the metadata. New transactions are already blocked, so we need to
 * wait for any remaining transactions to drain out before proceding.
 */
void
xfs_quiesce_attr(
	struct xfs_mount	*mp)
{
	int	error = 0;

	/* wait for all modifications to complete */
	while (atomic_read(&mp->m_active_trans) > 0)
		delay(100);

	/* flush inodes and push all remaining buffers out to disk */
	xfs_quiesce_fs(mp);

458 459 460 461 462
	/*
	 * Just warn here till VFS can correctly support
	 * read-only remount without racing.
	 */
	WARN_ON(atomic_read(&mp->m_active_trans) != 0);
D
David Chinner 已提交
463 464 465 466 467 468 469 470 471 472 473

	/* Push the superblock and write an unmount record */
	error = xfs_log_sbcount(mp, 1);
	if (error)
		xfs_fs_cmn_err(CE_WARN, mp,
				"xfs_attr_quiesce: failed to log sb changes. "
				"Frozen image may not be consistent.");
	xfs_log_unmount_write(mp);
	xfs_unmountfs_writesb(mp);
}

474 475 476 477 478 479 480 481 482 483 484
/*
 * Enqueue a work item to be picked up by the vfs xfssyncd thread.
 * Doing this has two advantages:
 * - It saves on stack space, which is tight in certain situations
 * - It can be used (with care) as a mechanism to avoid deadlocks.
 * Flushing while allocating in a full filesystem requires both.
 */
STATIC void
xfs_syncd_queue_work(
	struct xfs_mount *mp,
	void		*data,
485 486
	void		(*syncer)(struct xfs_mount *, void *),
	struct completion *completion)
487
{
488
	struct xfs_sync_work *work;
489

490
	work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP);
491 492 493 494
	INIT_LIST_HEAD(&work->w_list);
	work->w_syncer = syncer;
	work->w_data = data;
	work->w_mount = mp;
495
	work->w_completion = completion;
496 497 498 499 500 501 502 503 504 505 506 507 508
	spin_lock(&mp->m_sync_lock);
	list_add_tail(&work->w_list, &mp->m_sync_list);
	spin_unlock(&mp->m_sync_lock);
	wake_up_process(mp->m_sync_task);
}

/*
 * Flush delayed allocate data, attempting to free up reserved space
 * from existing allocations.  At this point a new allocation attempt
 * has failed with ENOSPC and we are in the process of scratching our
 * heads, looking about for more room...
 */
STATIC void
509
xfs_flush_inodes_work(
510 511 512 513
	struct xfs_mount *mp,
	void		*arg)
{
	struct inode	*inode = arg;
C
Christoph Hellwig 已提交
514
	xfs_sync_data(mp, SYNC_TRYLOCK);
C
Christoph Hellwig 已提交
515
	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
516 517 518 519
	iput(inode);
}

void
520
xfs_flush_inodes(
521 522 523
	xfs_inode_t	*ip)
{
	struct inode	*inode = VFS_I(ip);
524
	DECLARE_COMPLETION_ONSTACK(completion);
525 526

	igrab(inode);
527 528
	xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion);
	wait_for_completion(&completion);
529
	xfs_log_force(ip->i_mount, XFS_LOG_SYNC);
530 531
}

532
/*
533 534
 * Every sync period we need to unpin all items, reclaim inodes and sync
 * disk quotas.  We might need to cover the log to indicate that the
535
 * filesystem is idle and not frozen.
536
 */
537 538 539 540 541 542 543
STATIC void
xfs_sync_worker(
	struct xfs_mount *mp,
	void		*unused)
{
	int		error;

544
	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
545
		xfs_log_force(mp, 0);
546
		xfs_reclaim_inodes(mp, 0);
547
		/* dgc: errors ignored here */
C
Christoph Hellwig 已提交
548
		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
549 550 551
		if (mp->m_super->s_frozen == SB_UNFROZEN &&
		    xfs_log_need_covered(mp))
			error = xfs_fs_log_dummy(mp, 0);
552
	}
553 554 555 556 557 558 559 560 561 562
	mp->m_sync_seq++;
	wake_up(&mp->m_wait_single_sync_task);
}

STATIC int
xfssyncd(
	void			*arg)
{
	struct xfs_mount	*mp = arg;
	long			timeleft;
563
	xfs_sync_work_t		*work, *n;
564 565 566 567 568
	LIST_HEAD		(tmp);

	set_freezable();
	timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
	for (;;) {
569 570
		if (list_empty(&mp->m_sync_list))
			timeleft = schedule_timeout_interruptible(timeleft);
571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589
		/* swsusp */
		try_to_freeze();
		if (kthread_should_stop() && list_empty(&mp->m_sync_list))
			break;

		spin_lock(&mp->m_sync_lock);
		/*
		 * We can get woken by laptop mode, to do a sync -
		 * that's the (only!) case where the list would be
		 * empty with time remaining.
		 */
		if (!timeleft || list_empty(&mp->m_sync_list)) {
			if (!timeleft)
				timeleft = xfs_syncd_centisecs *
							msecs_to_jiffies(10);
			INIT_LIST_HEAD(&mp->m_sync_work.w_list);
			list_add_tail(&mp->m_sync_work.w_list,
					&mp->m_sync_list);
		}
590
		list_splice_init(&mp->m_sync_list, &tmp);
591 592 593 594 595 596 597
		spin_unlock(&mp->m_sync_lock);

		list_for_each_entry_safe(work, n, &tmp, w_list) {
			(*work->w_syncer)(mp, work->w_data);
			list_del(&work->w_list);
			if (work == &mp->m_sync_work)
				continue;
598 599
			if (work->w_completion)
				complete(work->w_completion);
600 601 602 603 604 605 606 607 608 609 610 611 612
			kmem_free(work);
		}
	}

	return 0;
}

int
xfs_syncd_init(
	struct xfs_mount	*mp)
{
	mp->m_sync_work.w_syncer = xfs_sync_worker;
	mp->m_sync_work.w_mount = mp;
613
	mp->m_sync_work.w_completion = NULL;
614
	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname);
615 616 617 618 619 620 621 622 623 624 625 626
	if (IS_ERR(mp->m_sync_task))
		return -PTR_ERR(mp->m_sync_task);
	return 0;
}

/*
 * Stop the xfssyncd thread that xfs_syncd_init() started for @mp.
 */
void
xfs_syncd_stop(
	struct xfs_mount	*mp)
{
	struct task_struct	*task = mp->m_sync_task;

	kthread_stop(task);
}

627 628 629 630 631 632 633 634
void
__xfs_inode_set_reclaim_tag(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip)
{
	radix_tree_tag_set(&pag->pag_ici_root,
			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
			   XFS_ICI_RECLAIM_TAG);
635 636 637 638 639 640 641 642 643 644 645

	if (!pag->pag_ici_reclaimable) {
		/* propagate the reclaim tag up into the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				XFS_ICI_RECLAIM_TAG);
		spin_unlock(&ip->i_mount->m_perag_lock);
		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
							-1, _RET_IP_);
	}
646
	pag->pag_ici_reclaimable++;
647 648
}

D
David Chinner 已提交
649 650 651 652 653
/*
 * We set the inode flag atomically with the radix tree tag.
 * Once we get tag lookups on the radix tree, this inode flag
 * can go away.
 */
654 655 656 657
void
xfs_inode_set_reclaim_tag(
	xfs_inode_t	*ip)
{
D
Dave Chinner 已提交
658 659
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_perag *pag;
660

D
Dave Chinner 已提交
661
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
662
	write_lock(&pag->pag_ici_lock);
663
	spin_lock(&ip->i_flags_lock);
664
	__xfs_inode_set_reclaim_tag(pag, ip);
D
David Chinner 已提交
665
	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
666
	spin_unlock(&ip->i_flags_lock);
667
	write_unlock(&pag->pag_ici_lock);
D
Dave Chinner 已提交
668
	xfs_perag_put(pag);
669 670 671 672 673 674 675 676 677 678
}

/*
 * Remove the reclaim tag of @ip from the inode radix tree of @pag and drop
 * the AG's reclaimable count.  When the count reaches zero the AG's reclaim
 * tag is cleared from the perag radix tree too.
 *
 * No locking of pag->pag_ici_root is done here.
 */
void
__xfs_inode_clear_reclaim_tag(
	xfs_mount_t	*mp,
	xfs_perag_t	*pag,
	xfs_inode_t	*ip)
{
	/* the perag tree is reached through the inode's own mount pointer */
	struct xfs_mount	*imp = ip->i_mount;

	radix_tree_tag_clear(&pag->pag_ici_root,
			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);

	pag->pag_ici_reclaimable--;
	if (pag->pag_ici_reclaimable != 0)
		return;

	/* clear the reclaim tag from the perag radix tree */
	spin_lock(&imp->m_perag_lock);
	radix_tree_tag_clear(&imp->m_perag_tree,
			XFS_INO_TO_AGNO(imp, ip->i_ino),
			XFS_ICI_RECLAIM_TAG);
	spin_unlock(&imp->m_perag_lock);
	trace_xfs_perag_clear_reclaim(imp, pag->pag_agno, -1, _RET_IP_);
}

692 693 694 695 696 697 698 699 700 701 702 703 704
/*
 * Inodes in different states need to be treated differently, and the return
 * value of xfs_iflush is not sufficient to get this right. The following table
 * lists the inode states and the reclaim actions necessary for non-blocking
 * reclaim:
 *
 *
 *	inode state	     iflush ret		required action
 *      ---------------      ----------         ---------------
 *	bad			-		reclaim
 *	shutdown		EIO		unpin and reclaim
 *	clean, unpinned		0		reclaim
 *	stale, unpinned		0		reclaim
705 706 707 708 709
 *	clean, pinned(*)	0		requeue
 *	stale, pinned		EAGAIN		requeue
 *	dirty, delwri ok	0		requeue
 *	dirty, delwri blocked	EAGAIN		requeue
 *	dirty, sync flush	0		reclaim
710 711 712 713
 *
 * (*) dgc: I don't think the clean, pinned state is possible but it gets
 * handled anyway given the order of checks implemented.
 *
714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731
 * As can be seen from the table, the return value of xfs_iflush() is not
 * sufficient to correctly decide the reclaim action here. The checks in
 * xfs_iflush() might look like duplicates, but they are not.
 *
 * Also, because we get the flush lock first, we know that any inode that has
 * been flushed delwri has had the flush completed by the time we check that
 * the inode is clean. The clean inode check needs to be done before flushing
 * the inode delwri otherwise we would loop forever requeuing clean inodes as
 * we cannot tell apart a successful delwri flush and a clean inode from the
 * return value of xfs_iflush().
 *
 * Note that because the inode is flushed delayed write by background
 * writeback, the flush lock may already be held here and waiting on it can
 * result in very long latencies. Hence for sync reclaims, where we wait on the
 * flush lock, the caller should push out delayed write inodes first before
 * trying to reclaim them to minimise the amount of time spent waiting. For
 * background relaim, we just requeue the inode for the next pass.
 *
732 733 734
 * Hence the order of actions after gaining the locks should be:
 *	bad		=> reclaim
 *	shutdown	=> unpin and reclaim
735 736
 *	pinned, delwri	=> requeue
 *	pinned, sync	=> unpin
737 738
 *	stale		=> reclaim
 *	clean		=> reclaim
739 740
 *	dirty, delwri	=> flush and requeue
 *	dirty, sync	=> flush, wait and reclaim
741
 */
742
STATIC int
743
xfs_reclaim_inode(
744 745
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
746
	int			sync_mode)
747
{
748
	int	error = 0;
749

750 751 752 753 754 755 756 757 758 759 760
	/*
	 * The radix tree lock here protects a thread in xfs_iget from racing
	 * with us starting reclaim on the inode.  Once we have the
	 * XFS_IRECLAIM flag set it will not touch us.
	 */
	spin_lock(&ip->i_flags_lock);
	ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE));
	if (__xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* ignore as it is already under reclaim */
		spin_unlock(&ip->i_flags_lock);
		write_unlock(&pag->pag_ici_lock);
761
		return 0;
762
	}
763 764 765 766 767
	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	write_unlock(&pag->pag_ici_lock);

	xfs_ilock(ip, XFS_ILOCK_EXCL);
768 769 770 771 772
	if (!xfs_iflock_nowait(ip)) {
		if (!(sync_mode & SYNC_WAIT))
			goto out;
		xfs_iflock(ip);
	}
773

774 775 776 777 778 779
	if (is_bad_inode(VFS_I(ip)))
		goto reclaim;
	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_iunpin_wait(ip);
		goto reclaim;
	}
780 781 782 783 784
	if (xfs_ipincount(ip)) {
		if (!(sync_mode & SYNC_WAIT)) {
			xfs_ifunlock(ip);
			goto out;
		}
785
		xfs_iunpin_wait(ip);
786
	}
787 788 789 790 791 792 793
	if (xfs_iflags_test(ip, XFS_ISTALE))
		goto reclaim;
	if (xfs_inode_clean(ip))
		goto reclaim;

	/* Now we have an inode that needs flushing */
	error = xfs_iflush(ip, sync_mode);
794 795 796
	if (sync_mode & SYNC_WAIT) {
		xfs_iflock(ip);
		goto reclaim;
797 798
	}

799 800 801 802 803 804
	/*
	 * When we have to flush an inode but don't have SYNC_WAIT set, we
	 * flush the inode out using a delwri buffer and wait for the next
	 * call into reclaim to find it in a clean state instead of waiting for
	 * it now. We also don't return errors here - if the error is transient
	 * then the next reclaim pass will flush the inode, and if the error
805
	 * is permanent then the next sync reclaim will reclaim the inode and
806 807
	 * pass on the error.
	 */
808
	if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824
		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
			"inode 0x%llx background reclaim flush failed with %d",
			(long long)ip->i_ino, error);
	}
out:
	xfs_iflags_clear(ip, XFS_IRECLAIM);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	/*
	 * We could return EAGAIN here to make reclaim rescan the inode tree in
	 * a short while. However, this just burns CPU time scanning the tree
	 * waiting for IO to complete and xfssyncd never goes back to the idle
	 * state. Instead, return 0 to let the next scheduled background reclaim
	 * attempt to reclaim the inode again.
	 */
	return 0;

825 826
reclaim:
	xfs_ifunlock(ip);
827
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857

	XFS_STATS_INC(xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree assert that it's been there before to catch
	 * problems with the inode life time early on.
	 */
	write_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
		ASSERT(0);
	write_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups.  This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.  We get
	 * both the ilock and the iolock because the code may need to drop the
	 * ilock one but will still hold the iolock.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
	xfs_qm_dqdetach(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);

	xfs_inode_free(ip);
858 859
	return error;

860 861 862 863 864 865 866
}

/*
 * Reclaim every inode tagged as reclaimable on @mp.
 *
 * @mode is handed to xfs_reclaim_inode() as its sync mode.  The walk takes
 * pag_ici_lock exclusively and has no scan limit.  Returns the result of
 * xfs_inode_ag_iterator().
 */
int
xfs_reclaim_inodes(
	xfs_mount_t	*mp,
	int		mode)
{
	int		error;

	error = xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode,
				      XFS_ICI_RECLAIM_TAG, 1, NULL);
	return error;
}

/*
 * Shrinker infrastructure.
 *
 * Shrinker callback for the mount that embeds @shrink as m_inode_shrink.
 * With a non-zero @nr_to_scan, up to that many reclaimable inodes are
 * scanned with a non-blocking reclaim; -1 is returned if the allocation
 * context does not allow filesystem recursion (__GFP_FS clear) or if the
 * scan finished without using up its budget.  Otherwise the return value is
 * the sum of the reclaimable inode counts of all AGs tagged for reclaim.
 */
static int
xfs_reclaim_inode_shrink(
	struct shrinker	*shrink,
	int		nr_to_scan,
	gfp_t		gfp_mask)
{
	struct xfs_mount *mp = container_of(shrink, struct xfs_mount,
					    m_inode_shrink);
	xfs_agnumber_t	agno = 0;
	int		total = 0;

	if (nr_to_scan) {
		if (!(gfp_mask & __GFP_FS))
			return -1;

		xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
					XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
		/* if we don't exhaust the scan, don't bother coming back */
		if (nr_to_scan > 0)
			return -1;
	}

	for (;;) {
		struct xfs_perag *pag;

		pag = xfs_inode_ag_iter_next_pag(mp, &agno,
						 XFS_ICI_RECLAIM_TAG);
		if (!pag)
			break;
		total += pag->pag_ici_reclaimable;
		xfs_perag_put(pag);
	}
	return total;
}

void
xfs_inode_shrinker_register(
	struct xfs_mount	*mp)
{
911 912 913
	mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink;
	mp->m_inode_shrink.seeks = DEFAULT_SEEKS;
	register_shrinker(&mp->m_inode_shrink);
914 915 916 917 918 919
}

void
xfs_inode_shrinker_unregister(
	struct xfs_mount	*mp)
{
920
	unregister_shrinker(&mp->m_inode_shrink);
921
}