// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2016 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_log.h"
#include "xfs_rmap.h"
#include "xfs_refcount.h"
#include "xfs_bmap.h"
#include "xfs_alloc.h"
#include "xfs_buf.h"

static struct kmem_cache	*xfs_defer_pending_cache;

/*
 * Deferred Operations in XFS
 *
 * Due to the way locking rules work in XFS, certain transactions (block
 * mapping and unmapping, typically) have permanent reservations so that
 * we can roll the transaction to adhere to AG locking order rules and
 * to unlock buffers between metadata updates.  Prior to rmap/reflink,
 * the mapping code had a mechanism to perform these deferrals for
 * extents that were going to be freed; this code makes that facility
 * more generic.
 *
 * When adding the reverse mapping and reflink features, it became
 * necessary to perform complex remapping multi-transactions to comply
 * with AG locking order rules, and to be able to spread a single
 * refcount update operation (an operation on an n-block extent can
 * update as many as n records!) among multiple transactions.  XFS can
 * roll a transaction to facilitate this, but using this facility
 * requires us to log "intent" items in case log recovery needs to
 * redo the operation, and to log "done" items to indicate that redo
 * is not necessary.
 *
 * Deferred work is tracked in xfs_defer_pending items.  Each pending
 * item tracks one type of deferred work.  Incoming work items (which
 * have not yet had an intent logged) are attached to a pending item
 * on the dop_intake list, where they wait for the caller to finish
 * the deferred operations.
 *
 * Finishing a set of deferred operations is an involved process.  To
 * start, we define "rolling a deferred-op transaction" as follows:
 *
 * > For each xfs_defer_pending item on the dop_intake list,
 *   - Sort the work items in AG order.  XFS locking
 *     order rules require us to lock buffers in AG order.
 *   - Create a log intent item for that type.
 *   - Attach it to the pending item.
 *   - Move the pending item from the dop_intake list to the
 *     dop_pending list.
 * > Roll the transaction.
 *
 * NOTE: To avoid exceeding the transaction reservation, we limit the
 * number of items that we attach to a given xfs_defer_pending.
 *
 * The actual finishing process looks like this:
 *
 * > For each xfs_defer_pending in the dop_pending list,
 *   - Roll the deferred-op transaction as above.
 *   - Create a log done item for that type, and attach it to the
 *     log intent item.
 *   - For each work item attached to the log intent item,
 *     * Perform the described action.
 *     * Attach the work item to the log done item.
 *     * If the result of doing the work was -EAGAIN, ->finish_item
 *       wants a new transaction.  See the "Requesting a Fresh
 *       Transaction while Finishing Deferred Work" section below for
 *       details.
 *
 * The key here is that we must log an intent item for all pending
 * work items every time we roll the transaction, and that we must log
 * a done item as soon as the work is completed.  With this mechanism
 * we can perform complex remapping operations, chaining intent items
 * as needed.
 *
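 * As a sketch of the caller side (a hypothetical caller; the real ones
 * live in the bmap/rmap/refcount code), the pattern looks like:
 *
 *	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
 *	if (error)
 *		return error;
 *	...make metadata changes that queue follow-on work, e.g....
 *	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new_item->xefi_list);
 *	error = xfs_defer_finish(&tp);	(logs intents, rolls, finishes)
 *	if (error)
 *		goto out_cancel;
 *	error = xfs_trans_commit(tp);
 *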
 * Requesting a Fresh Transaction while Finishing Deferred Work
 *
 * If ->finish_item decides that it needs a fresh transaction to
 * finish the work, it must ask its caller (xfs_defer_finish) for a
 * continuation.  The most likely cause of this circumstance is the
 * refcount adjust functions deciding that they've logged enough items
 * to be at risk of exceeding the transaction reservation.
 *
 * To get a fresh transaction, we want to log the existing log done
 * item to prevent the log intent item from replaying, immediately log
 * a new log intent item with the unfinished work items, roll the
 * transaction, and re-call ->finish_item wherever it left off.  The
 * log done item and the new log intent item must be in the same
 * transaction or atomicity cannot be guaranteed; defer_finish ensures
 * that this happens.
 *
 * This requires some coordination between ->finish_item and
 * defer_finish.  Upon deciding to request a new transaction,
 * ->finish_item should update the current work item to reflect the
 * unfinished work.  Next, it should reset the log done item's list
 * count to the number of items finished, and return -EAGAIN.
 * defer_finish sees the -EAGAIN, logs the new log intent item
 * with the remaining work items, and leaves the xfs_defer_pending
 * item at the head of the dop_work queue.  Then it rolls the
 * transaction and picks up processing where it left off.  Note that
 * ->finish_item must be careful to leave enough transaction
 * reservation to fit the new log intent item.
 *
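 * A ->finish_item implementation that honors this contract might be
 * shaped like the sketch below.  The xfs_foo_* names are hypothetical;
 * the refcount deferred op type implements the real version of this:
 *
 *	STATIC int
 *	xfs_foo_finish_item(
 *		struct xfs_trans	*tp,
 *		struct xfs_log_item	*done,
 *		struct list_head	*item,
 *		struct xfs_btree_cur	**state)
 *	{
 *		struct xfs_foo_intent	*fi;
 *		int			error;
 *
 *		fi = container_of(item, struct xfs_foo_intent, fi_list);
 *		error = xfs_foo_do_work(tp, fi, state);
 *		if (error == -EAGAIN) {
 *			...trim fi to describe only the unfinished work...
 *			...reset the done item's count to what finished...
 *		}
 *		return error;
 *	}
 *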
 * This is an example of remapping the extent (E, E+B) into file X at
 * offset A and dealing with the extent (C, C+B) already being mapped
 * there:
 * +-------------------------------------------------+
 * | Unmap file X startblock C offset A length B     | t0
 * | Intent to reduce refcount for extent (C, B)     |
 * | Intent to remove rmap (X, C, A, B)              |
 * | Intent to free extent (D, 1) (bmbt block)       |
 * | Intent to map (X, A, B) at startblock E         |
 * +-------------------------------------------------+
 * | Map file X startblock E offset A length B       | t1
 * | Done mapping (X, E, A, B)                       |
 * | Intent to increase refcount for extent (E, B)   |
 * | Intent to add rmap (X, E, A, B)                 |
 * +-------------------------------------------------+
 * | Reduce refcount for extent (C, B)               | t2
 * | Done reducing refcount for extent (C, 9)        |
 * | Intent to reduce refcount for extent (C+9, B-9) |
 * | (ran out of space after 9 refcount updates)     |
 * +-------------------------------------------------+
 * | Reduce refcount for extent (C+9, B-9)           | t3
 * | Done reducing refcount for extent (C+9, B-9)    |
 * | Increase refcount for extent (E, B)             |
 * | Done increasing refcount for extent (E, B)      |
 * | Intent to free extent (C, B)                    |
 * | Intent to free extent (F, 1) (refcountbt block) |
 * | Intent to remove rmap (F, 1, REFC)              |
 * +-------------------------------------------------+
 * | Remove rmap (X, C, A, B)                        | t4
 * | Done removing rmap (X, C, A, B)                 |
 * | Add rmap (X, E, A, B)                           |
 * | Done adding rmap (X, E, A, B)                   |
 * | Remove rmap (F, 1, REFC)                        |
 * | Done removing rmap (F, 1, REFC)                 |
 * +-------------------------------------------------+
 * | Free extent (C, B)                              | t5
 * | Done freeing extent (C, B)                      |
 * | Free extent (D, 1)                              |
 * | Done freeing extent (D, 1)                      |
 * | Free extent (F, 1)                              |
 * | Done freeing extent (F, 1)                      |
 * +-------------------------------------------------+
 *
 * If we should crash before t2 commits, log recovery replays
 * the following intent items:
 *
 * - Intent to reduce refcount for extent (C, B)
 * - Intent to remove rmap (X, C, A, B)
 * - Intent to free extent (D, 1) (bmbt block)
 * - Intent to increase refcount for extent (E, B)
 * - Intent to add rmap (X, E, A, B)
 *
 * In the process of recovering, it should also generate and take care
 * of these intent items:
 *
 * - Intent to free extent (C, B)
 * - Intent to free extent (F, 1) (refcountbt block)
 * - Intent to remove rmap (F, 1, REFC)
 *
 * Note that the continuation requested between t2 and t3 is likely to
 * reoccur.
 */

static const struct xfs_defer_op_type *defer_op_types[] = {
	[XFS_DEFER_OPS_TYPE_BMAP]	= &xfs_bmap_update_defer_type,
	[XFS_DEFER_OPS_TYPE_REFCOUNT]	= &xfs_refcount_update_defer_type,
	[XFS_DEFER_OPS_TYPE_RMAP]	= &xfs_rmap_update_defer_type,
	[XFS_DEFER_OPS_TYPE_FREE]	= &xfs_extent_free_defer_type,
	[XFS_DEFER_OPS_TYPE_AGFL_FREE]	= &xfs_agfl_free_defer_type,
};
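
/*
 * For reference, each ops vector above has roughly this shape (a sketch
 * of the declaration in xfs_defer.h; the exact member list varies across
 * kernel versions):
 *
 *	struct xfs_defer_op_type {
 *		struct xfs_log_item *(*create_intent)(struct xfs_trans *tp,
 *			struct list_head *items, unsigned int count, bool sort);
 *		void (*abort_intent)(struct xfs_log_item *intent);
 *		struct xfs_log_item *(*create_done)(struct xfs_trans *tp,
 *			struct xfs_log_item *intent, unsigned int count);
 *		int (*finish_item)(struct xfs_trans *tp,
 *			struct xfs_log_item *done, struct list_head *item,
 *			struct xfs_btree_cur **state);
 *		void (*finish_cleanup)(struct xfs_trans *tp,
 *			struct xfs_btree_cur *state, int error);
 *		void (*cancel_item)(struct list_head *item);
 *		unsigned int		max_items;
 *	};
 */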

static bool
xfs_defer_create_intent(
	struct xfs_trans		*tp,
	struct xfs_defer_pending	*dfp,
	bool				sort)
{
	const struct xfs_defer_op_type	*ops = defer_op_types[dfp->dfp_type];

	if (!dfp->dfp_intent)
		dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
						     dfp->dfp_count, sort);
	return dfp->dfp_intent != NULL;
}

/*
 * For each pending item in the intake list, log its intent item and the
 * associated extents, then add the entire intake list to the end of
 * the pending list.
 */
static bool
xfs_defer_create_intents(
	struct xfs_trans		*tp)
{
	struct xfs_defer_pending	*dfp;
	bool				ret = false;

	list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
		trace_xfs_defer_create_intent(tp->t_mountp, dfp);
		ret |= xfs_defer_create_intent(tp, dfp, true);
	}
	return ret;
}

/* Abort all the intents that were committed. */
STATIC void
xfs_defer_trans_abort(
	struct xfs_trans		*tp,
	struct list_head		*dop_pending)
{
	struct xfs_defer_pending	*dfp;
	const struct xfs_defer_op_type	*ops;

	trace_xfs_defer_trans_abort(tp, _RET_IP_);

	/* Abort intent items that don't have a done item. */
	list_for_each_entry(dfp, dop_pending, dfp_list) {
		ops = defer_op_types[dfp->dfp_type];
		trace_xfs_defer_pending_abort(tp->t_mountp, dfp);
		if (dfp->dfp_intent && !dfp->dfp_done) {
			ops->abort_intent(dfp->dfp_intent);
			dfp->dfp_intent = NULL;
		}
	}
}

/*
 * Capture resources that the caller said not to release ("held") when the
 * transaction commits.  Caller is responsible for zero-initializing @dres.
 */
static int
xfs_defer_save_resources(
	struct xfs_defer_resources	*dres,
	struct xfs_trans		*tp)
{
	struct xfs_buf_log_item		*bli;
	struct xfs_inode_log_item	*ili;
	struct xfs_log_item		*lip;

	BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS);

	list_for_each_entry(lip, &tp->t_items, li_trans) {
		switch (lip->li_type) {
		case XFS_LI_BUF:
			bli = container_of(lip, struct xfs_buf_log_item,
					   bli_item);
			if (bli->bli_flags & XFS_BLI_HOLD) {
				if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) {
					ASSERT(0);
					return -EFSCORRUPTED;
				}
				if (bli->bli_flags & XFS_BLI_ORDERED)
					dres->dr_ordered |=
							(1U << dres->dr_bufs);
				else
					xfs_trans_dirty_buf(tp, bli->bli_buf);
				dres->dr_bp[dres->dr_bufs++] = bli->bli_buf;
			}
			break;
		case XFS_LI_INODE:
			ili = container_of(lip, struct xfs_inode_log_item,
					   ili_item);
			if (ili->ili_lock_flags == 0) {
				if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) {
					ASSERT(0);
					return -EFSCORRUPTED;
				}
				xfs_trans_log_inode(tp, ili->ili_inode,
						    XFS_ILOG_CORE);
				dres->dr_ip[dres->dr_inos++] = ili->ili_inode;
			}
			break;
		default:
			break;
		}
	}

	return 0;
}

/* Attach the held resources to the transaction. */
static void
xfs_defer_restore_resources(
	struct xfs_trans		*tp,
	struct xfs_defer_resources	*dres)
{
	unsigned short			i;

	/* Rejoin the joined inodes. */
	for (i = 0; i < dres->dr_inos; i++)
		xfs_trans_ijoin(tp, dres->dr_ip[i], 0);

	/* Rejoin the buffers and dirty them so the log moves forward. */
	for (i = 0; i < dres->dr_bufs; i++) {
		xfs_trans_bjoin(tp, dres->dr_bp[i]);
		if (dres->dr_ordered & (1U << i))
			xfs_trans_ordered_buf(tp, dres->dr_bp[i]);
		xfs_trans_bhold(tp, dres->dr_bp[i]);
	}
}

/* Roll a transaction so we can do some deferred op processing. */
STATIC int
xfs_defer_trans_roll(
	struct xfs_trans		**tpp)
{
	struct xfs_defer_resources	dres = { };
	int				error;

	error = xfs_defer_save_resources(&dres, *tpp);
	if (error)
		return error;

	trace_xfs_defer_trans_roll(*tpp, _RET_IP_);

	/*
	 * Roll the transaction.  Rolling always gives a new transaction (even
	 * if committing the old one fails!) to hand back to the caller, so we
	 * join the held resources to the new transaction so that we always
	 * return with the held resources joined to @tpp, no matter what
	 * happened.
	 */
	error = xfs_trans_roll(tpp);

	xfs_defer_restore_resources(*tpp, &dres);

	if (error)
		trace_xfs_defer_trans_roll_error(*tpp, error);
	return error;
}

/*
 * Free up any items left in the list.
 */
static void
xfs_defer_cancel_list(
	struct xfs_mount		*mp,
	struct list_head		*dop_list)
{
	struct xfs_defer_pending	*dfp;
	struct xfs_defer_pending	*pli;
	struct list_head		*pwi;
	struct list_head		*n;
	const struct xfs_defer_op_type	*ops;

	/*
	 * Free the pending items.  Caller should already have arranged
	 * for the intent items to be released.
	 */
	list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) {
		ops = defer_op_types[dfp->dfp_type];
		trace_xfs_defer_cancel_list(mp, dfp);
		list_del(&dfp->dfp_list);
		list_for_each_safe(pwi, n, &dfp->dfp_work) {
			list_del(pwi);
			dfp->dfp_count--;
			ops->cancel_item(pwi);
		}
		ASSERT(dfp->dfp_count == 0);
		kmem_cache_free(xfs_defer_pending_cache, dfp);
	}
}

/*
 * Prevent a log intent item from pinning the tail of the log by logging a
 * done item to release the intent item; and then log a new intent item.
 * The caller should provide a fresh transaction and roll it after we're done.
 */
static int
xfs_defer_relog(
	struct xfs_trans		**tpp,
	struct list_head		*dfops)
{
	struct xlog			*log = (*tpp)->t_mountp->m_log;
	struct xfs_defer_pending	*dfp;
	xfs_lsn_t			threshold_lsn = NULLCOMMITLSN;

	ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES);

	list_for_each_entry(dfp, dfops, dfp_list) {
		/*
		 * If the log intent item for this deferred op is not a part of
		 * the current log checkpoint, relog the intent item to keep
		 * the log tail moving forward.  We're ok with this being racy
		 * because an incorrect decision means we'll be a little slower
		 * at pushing the tail.
		 */
		if (dfp->dfp_intent == NULL ||
		    xfs_log_item_in_current_chkpt(dfp->dfp_intent))
			continue;

		/*
		 * Figure out where we need the tail to be in order to maintain
		 * the minimum required free space in the log.  Only sample
		 * the log threshold once per call.
		 */
		if (threshold_lsn == NULLCOMMITLSN) {
			threshold_lsn = xlog_grant_push_threshold(log, 0);
			if (threshold_lsn == NULLCOMMITLSN)
				break;
		}
		if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0)
			continue;

		trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp);
		XFS_STATS_INC((*tpp)->t_mountp, defer_relog);
		dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp);
	}

	if ((*tpp)->t_flags & XFS_TRANS_DIRTY)
		return xfs_defer_trans_roll(tpp);
	return 0;
}

/*
 * Log an intent-done item for the first pending intent, and finish the work
 * items.
 */
static int
xfs_defer_finish_one(
	struct xfs_trans		*tp,
	struct xfs_defer_pending	*dfp)
{
	const struct xfs_defer_op_type	*ops = defer_op_types[dfp->dfp_type];
	struct xfs_btree_cur		*state = NULL;
	struct list_head		*li, *n;
	int				error;

	trace_xfs_defer_pending_finish(tp->t_mountp, dfp);

	dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count);
	list_for_each_safe(li, n, &dfp->dfp_work) {
		list_del(li);
		dfp->dfp_count--;
		error = ops->finish_item(tp, dfp->dfp_done, li, &state);
		if (error == -EAGAIN) {
			/*
			 * Caller wants a fresh transaction; put the work item
			 * back on the list and log a new log intent item to
			 * replace the old one.  See "Requesting a Fresh
			 * Transaction while Finishing Deferred Work" above.
			 */
			list_add(li, &dfp->dfp_work);
			dfp->dfp_count++;
			dfp->dfp_done = NULL;
			dfp->dfp_intent = NULL;
			xfs_defer_create_intent(tp, dfp, false);
		}

		if (error)
			goto out;
	}

	/* Done with the dfp, free it. */
	list_del(&dfp->dfp_list);
	kmem_cache_free(xfs_defer_pending_cache, dfp);
out:
	if (ops->finish_cleanup)
		ops->finish_cleanup(tp, state, error);
	return error;
}

/*
 * Finish all the pending work.  This involves logging intent items for
 * any work items that wandered in since the last transaction roll (if
 * one has even happened), rolling the transaction, and finishing the
 * work items in the first item on the logged-and-pending list.
 */
int
xfs_defer_finish_noroll(
	struct xfs_trans		**tp)
{
	struct xfs_defer_pending	*dfp = NULL;
	int				error = 0;
	LIST_HEAD(dop_pending);

	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);

	trace_xfs_defer_finish(*tp, _RET_IP_);

	/* Until we run out of pending work to finish... */
	while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
		/*
		 * Deferred items that are created in the process of finishing
		 * other deferred work items should be queued at the head of
		 * the pending list, which puts them ahead of the deferred work
		 * that was created by the caller.  This keeps the number of
		 * pending work items to a minimum, which decreases the amount
		 * of time that any one intent item can stick around in memory,
		 * pinning the log tail.
		 */
		bool has_intents = xfs_defer_create_intents(*tp);

		list_splice_init(&(*tp)->t_dfops, &dop_pending);

		if (has_intents || dfp) {
			error = xfs_defer_trans_roll(tp);
			if (error)
				goto out_shutdown;

			/* Relog intent items to keep the log moving. */
			error = xfs_defer_relog(tp, &dop_pending);
			if (error)
				goto out_shutdown;
		}

		dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
				       dfp_list);
		error = xfs_defer_finish_one(*tp, dfp);
		if (error && error != -EAGAIN)
			goto out_shutdown;
	}

	trace_xfs_defer_finish_done(*tp, _RET_IP_);
	return 0;

out_shutdown:
	xfs_defer_trans_abort(*tp, &dop_pending);
	xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
	trace_xfs_defer_finish_error(*tp, error);
	xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
	xfs_defer_cancel(*tp);
	return error;
}

int
xfs_defer_finish(
	struct xfs_trans	**tp)
{
	int			error;

	/*
	 * Finish and roll the transaction once more to avoid returning to the
	 * caller with a dirty transaction.
	 */
	error = xfs_defer_finish_noroll(tp);
	if (error)
		return error;
	if ((*tp)->t_flags & XFS_TRANS_DIRTY) {
		error = xfs_defer_trans_roll(tp);
		if (error) {
			xfs_force_shutdown((*tp)->t_mountp,
					   SHUTDOWN_CORRUPT_INCORE);
			return error;
		}
	}

	/* Reset LOWMODE now that we've finished all the dfops. */
	ASSERT(list_empty(&(*tp)->t_dfops));
	(*tp)->t_flags &= ~XFS_TRANS_LOWMODE;
	return 0;
}
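
/*
 * Sketch of how a caller keeps a buffer locked across the rolls that
 * xfs_defer_finish() performs (hypothetical caller; the held buffer is
 * exactly what xfs_defer_save_resources/xfs_defer_restore_resources
 * carry from one transaction to the next):
 *
 *	xfs_trans_bjoin(tp, bp);
 *	xfs_trans_bhold(tp, bp);
 *	...queue deferred work...
 *	error = xfs_defer_finish(&tp);
 *	(bp is still locked, joined, and held here, even on error)
 */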

void
xfs_defer_cancel(
	struct xfs_trans	*tp)
{
	struct xfs_mount	*mp = tp->t_mountp;

	trace_xfs_defer_cancel(tp, _RET_IP_);
	xfs_defer_cancel_list(mp, &tp->t_dfops);
}

/* Add an item for later deferred processing. */
void
xfs_defer_add(
	struct xfs_trans		*tp,
	enum xfs_defer_ops_type		type,
	struct list_head		*li)
{
	struct xfs_defer_pending	*dfp = NULL;
	const struct xfs_defer_op_type	*ops;

	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
	BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);

	/*
	 * Add the item to a pending item at the end of the intake list.
	 * If the last pending item has the same type, reuse it.  Else,
	 * create a new pending item at the end of the intake list.
	 */
	if (!list_empty(&tp->t_dfops)) {
		dfp = list_last_entry(&tp->t_dfops,
				struct xfs_defer_pending, dfp_list);
		ops = defer_op_types[dfp->dfp_type];
		if (dfp->dfp_type != type ||
		    (ops->max_items && dfp->dfp_count >= ops->max_items))
			dfp = NULL;
	}
	if (!dfp) {
		dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
				GFP_NOFS | __GFP_NOFAIL);
		dfp->dfp_type = type;
		dfp->dfp_intent = NULL;
		dfp->dfp_done = NULL;
		dfp->dfp_count = 0;
		INIT_LIST_HEAD(&dfp->dfp_work);
		list_add_tail(&dfp->dfp_list, &tp->t_dfops);
	}

	list_add_tail(li, &dfp->dfp_work);
	dfp->dfp_count++;
}
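
/*
 * A minimal usage sketch, modeled on the extent-free code (the xefi
 * names come from struct xfs_extent_free_item; __xfs_bmap_add_free()
 * was the real caller at the time of writing):
 *
 *	struct xfs_extent_free_item	*xefi;
 *
 *	xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
 *			GFP_KERNEL | __GFP_NOFAIL);
 *	xefi->xefi_startblock = bno;
 *	xefi->xefi_blockcount = (xfs_extlen_t)len;
 *	xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
 */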

/*
 * Move deferred ops from one transaction to another and reset the source to
 * initial state. This is primarily used to carry state forward across
 * transaction rolls with pending dfops.
 */
void
xfs_defer_move(
	struct xfs_trans	*dtp,
	struct xfs_trans	*stp)
{
	list_splice_init(&stp->t_dfops, &dtp->t_dfops);

	/*
	 * Low free space mode was historically controlled by a dfops field.
	 * This meant that low mode state potentially carried across multiple
	 * transaction rolls. Transfer low mode on a dfops move to preserve
	 * that behavior.
	 */
	dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE);
	stp->t_flags &= ~XFS_TRANS_LOWMODE;
}
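
/*
 * The main caller is xfs_trans_dup() in xfs_trans.c, which carries any
 * pending dfops into the follow-up transaction during a roll (sketch):
 *
 *	ntp = xfs_trans_dup(tp);
 *		-> xfs_defer_move(ntp, tp);
 */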

/*
 * Prepare a chain of fresh deferred ops work items to be completed later.  Log
 * recovery requires the ability to put off until later the actual finishing
 * work so that it can process unfinished items recovered from the log in
 * correct order.
 *
 * Create and log intent items for all the work that we're capturing so that we
 * can be assured that the items will get replayed if the system goes down
 * before log recovery gets a chance to finish the work it put off.  The entire
 * deferred ops state is transferred to the capture structure and the
 * transaction is then ready for the caller to commit it.  If there are no
 * intent items to capture, this function returns NULL.
 *
 * The capture structure also obtains extra references to any inodes and
 * buffers that were held in the transaction, because the caller is expected
 * to drop its own references once the transaction commits.
 */
static struct xfs_defer_capture *
xfs_defer_ops_capture(
	struct xfs_trans		*tp)
{
	struct xfs_defer_capture	*dfc;
	unsigned short			i;
	int				error;

	if (list_empty(&tp->t_dfops))
		return NULL;

	/* Create an object to capture the defer ops. */
	dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
	INIT_LIST_HEAD(&dfc->dfc_list);
	INIT_LIST_HEAD(&dfc->dfc_dfops);

	xfs_defer_create_intents(tp);

	/* Move the dfops chain and transaction state to the capture struct. */
	list_splice_init(&tp->t_dfops, &dfc->dfc_dfops);
	dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
	tp->t_flags &= ~XFS_TRANS_LOWMODE;

	/* Capture the remaining block reservations along with the dfops. */
	dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used;
	dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used;

	/* Preserve the log reservation size. */
	dfc->dfc_logres = tp->t_log_res;

	error = xfs_defer_save_resources(&dfc->dfc_held, tp);
	if (error) {
		/*
		 * Resource capture should never fail, but if it does, we
		 * still have to shut down the log and release things
		 * properly.
		 */
		xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE);
	}

	/*
	 * Grab extra references to the inodes and buffers because callers are
	 * expected to release their held references after we commit the
	 * transaction.
	 */
	for (i = 0; i < dfc->dfc_held.dr_inos; i++) {
		ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL));
		ihold(VFS_I(dfc->dfc_held.dr_ip[i]));
	}

	for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
		xfs_buf_hold(dfc->dfc_held.dr_bp[i]);

	return dfc;
}

/* Release all resources that we used to capture deferred ops. */
void
xfs_defer_ops_capture_free(
	struct xfs_mount		*mp,
	struct xfs_defer_capture	*dfc)
{
	unsigned short			i;

	xfs_defer_cancel_list(mp, &dfc->dfc_dfops);

	for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
		xfs_buf_relse(dfc->dfc_held.dr_bp[i]);

	for (i = 0; i < dfc->dfc_held.dr_inos; i++)
		xfs_irele(dfc->dfc_held.dr_ip[i]);

	kmem_free(dfc);
}

/*
 * Capture any deferred ops and commit the transaction.  This is the last step
 * needed to finish a log intent item that we recovered from the log.  If any
 * of the deferred ops operate on an inode, the caller must have joined that
 * inode to the transaction so that its reference can be transferred to the
 * capture structure.  The caller must hold ILOCK_EXCL on the inode, and must
 * unlock it before calling
 * xfs_defer_ops_continue.
 */
int
xfs_defer_ops_capture_and_commit(
	struct xfs_trans		*tp,
	struct list_head		*capture_list)
{
	struct xfs_mount		*mp = tp->t_mountp;
	struct xfs_defer_capture	*dfc;
	int				error;

	/* If we don't capture anything, commit transaction and exit. */
	dfc = xfs_defer_ops_capture(tp);
	if (!dfc)
		return xfs_trans_commit(tp);

	/* Commit the transaction and add the capture structure to the list. */
	error = xfs_trans_commit(tp);
	if (error) {
		xfs_defer_ops_capture_free(mp, dfc);
		return error;
	}

	list_add_tail(&dfc->dfc_list, capture_list);
	return 0;
}
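
/*
 * Sketch of the log recovery side that consumes capture_list (a
 * simplified rendering of xlog_finish_defer_ops() in xfs_log_recover.c):
 *
 *	list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
 *		struct xfs_trans_res	resv = {
 *			.tr_logres	= dfc->dfc_logres,
 *			.tr_logcount	= 1,
 *			.tr_logflags	= XFS_TRANS_PERM_LOG_RES,
 *		};
 *
 *		error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
 *				dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
 *		if (error)
 *			return error;
 *		list_del_init(&dfc->dfc_list);
 *		xfs_defer_ops_continue(dfc, tp, &dres);
 *		error = xfs_trans_commit(tp);
 *		xfs_defer_resources_rele(&dres);
 *		if (error)
 *			return error;
 *	}
 */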

/*
 * Attach a chain of captured deferred ops to a new transaction and free the
 * capture structure.  If an inode was captured, it will be passed back to the
 * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0.
 * The caller now owns the inode reference.
 */
void
xfs_defer_ops_continue(
	struct xfs_defer_capture	*dfc,
	struct xfs_trans		*tp,
	struct xfs_defer_resources	*dres)
{
	unsigned int			i;

	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
	ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));

	/* Lock the captured resources to the new transaction. */
	if (dfc->dfc_held.dr_inos == 2)
		xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL,
				    dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL);
	else if (dfc->dfc_held.dr_inos == 1)
		xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL);

	for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
		xfs_buf_lock(dfc->dfc_held.dr_bp[i]);

	/* Join the captured resources to the new transaction. */
	xfs_defer_restore_resources(tp, &dfc->dfc_held);
	memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources));
	dres->dr_bufs = 0;

	/* Move captured dfops chain and state to the transaction. */
	list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
	tp->t_flags |= dfc->dfc_tpflags;

	kmem_free(dfc);
}

/* Release the resources captured and continued during recovery. */
void
xfs_defer_resources_rele(
	struct xfs_defer_resources	*dres)
{
	unsigned short			i;

	for (i = 0; i < dres->dr_inos; i++) {
		xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL);
		xfs_irele(dres->dr_ip[i]);
		dres->dr_ip[i] = NULL;
	}

	for (i = 0; i < dres->dr_bufs; i++) {
		xfs_buf_relse(dres->dr_bp[i]);
		dres->dr_bp[i] = NULL;
	}

	dres->dr_inos = 0;
	dres->dr_bufs = 0;
	dres->dr_ordered = 0;
}

static inline int __init
xfs_defer_init_cache(void)
{
	xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending",
			sizeof(struct xfs_defer_pending),
			0, 0, NULL);

	return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM;
}

static inline void
xfs_defer_destroy_cache(void)
{
	kmem_cache_destroy(xfs_defer_pending_cache);
	xfs_defer_pending_cache = NULL;
}

/* Set up caches for deferred work items. */
int __init
xfs_defer_init_item_caches(void)
{
	int				error;

	error = xfs_defer_init_cache();
	if (error)
		return error;
	error = xfs_rmap_intent_init_cache();
	if (error)
		goto err;
	error = xfs_refcount_intent_init_cache();
	if (error)
		goto err;
	error = xfs_bmap_intent_init_cache();
	if (error)
		goto err;
	error = xfs_extfree_intent_init_cache();
	if (error)
		goto err;

	return 0;
err:
	xfs_defer_destroy_item_caches();
	return error;
}

/* Destroy all the deferred work item caches, if they've been allocated. */
void
xfs_defer_destroy_item_caches(void)
{
	xfs_extfree_intent_destroy_cache();
	xfs_bmap_intent_destroy_cache();
	xfs_refcount_intent_destroy_cache();
	xfs_rmap_intent_destroy_cache();
	xfs_defer_destroy_cache();
}