// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_trans.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
#include "xfs_fsops.h"
#include "xfs_trans_space.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"

/*
 * growfs operations
 */

/*
 * Grow the data device: validate the new size, initialize any new AGs,
 * extend the old last AG if it gained blocks, and transactionally update
 * the superblock geometry fields.  Returns 0 or a negative errno.
 */
static int
xfs_growfs_data_private(
	xfs_mount_t		*mp,		/* mount point for filesystem */
	xfs_growfs_data_t	*in)		/* growfs data input struct */
{
	xfs_buf_t		*bp;
	int			error;
	xfs_agnumber_t		nagcount;
	xfs_agnumber_t		nagimax = 0;
	xfs_rfsblock_t		nb, nb_mod;
	xfs_rfsblock_t		new;
	xfs_agnumber_t		oagcount;
	xfs_trans_t		*tp;
	struct aghdr_init_data	id = {};

	nb = in->newblocks;
	/* Shrinking is not supported through this path. */
	if (nb < mp->m_sb.sb_dblocks)
		return -EINVAL;
	if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb)))
		return error;
	/* Probe-read the last sector of the new range to prove it exists. */
	error = xfs_buf_read_uncached(mp->m_ddev_targp,
				XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
				XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL);
	if (error)
		return error;
	xfs_buf_relse(bp);

	new = nb;	/* use new as a temporary here */
	nb_mod = do_div(new, mp->m_sb.sb_agblocks);
	nagcount = new + (nb_mod != 0);
	if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
		/* A runt last AG is not allowed; round down to full AGs. */
		nagcount--;
		nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
		if (nb < mp->m_sb.sb_dblocks)
			return -EINVAL;
	}
	new = nb - mp->m_sb.sb_dblocks;
	oagcount = mp->m_sb.sb_agcount;

	/* allocate the new per-ag structures */
	if (nagcount > oagcount) {
		error = xfs_initialize_perag(mp, nagcount, &nagimax);
		if (error)
			return error;
	}

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
			XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
	if (error)
		return error;

	/*
	 * Write new AG headers to disk. Non-transactional, but need to be
	 * written and completed prior to the growfs transaction being logged.
	 * To do this, we use a delayed write buffer list and wait for
	 * submission and IO completion of the list as a whole. This allows the
	 * IO subsystem to merge all the AG headers in a single AG into a single
	 * IO and hide most of the latency of the IO from us.
	 *
	 * This also means that if we get an error whilst building the buffer
	 * list to write, we can cancel the entire list without having written
	 * anything.
	 */
	INIT_LIST_HEAD(&id.buffer_list);
	for (id.agno = nagcount - 1;
	     id.agno >= oagcount;
	     id.agno--, new -= id.agsize) {

		if (id.agno == nagcount - 1)
			id.agsize = nb -
				(id.agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks);
		else
			id.agsize = mp->m_sb.sb_agblocks;

		error = xfs_ag_init_headers(mp, &id);
		if (error) {
			xfs_buf_delwri_cancel(&id.buffer_list);
			goto out_trans_cancel;
		}
	}
	error = xfs_buf_delwri_submit(&id.buffer_list);
	if (error)
		goto out_trans_cancel;

	/* If there are new blocks in the old last AG, extend it. */
	if (new) {
		error = xfs_ag_extend_space(mp, tp, &id, new);
		if (error)
			goto out_trans_cancel;
	}

	/*
	 * Update changed superblock fields transactionally. These are not
	 * seen by the rest of the world until the transaction commit applies
	 * them atomically to the superblock.
	 */
	if (nagcount > oagcount)
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
	if (nb > mp->m_sb.sb_dblocks)
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS,
				 nb - mp->m_sb.sb_dblocks);
	if (id.nfree)
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree);
	xfs_trans_set_sync(tp);
	error = xfs_trans_commit(tp);
	if (error)
		return error;

	/* New allocation groups fully initialized, so update mount struct */
	if (nagimax)
		mp->m_maxagi = nagimax;
	xfs_set_low_space_thresholds(mp);
	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);

	/*
	 * If we expanded the last AG, free the per-AG reservation
	 * so we can reinitialize it with the new size.
	 */
	if (new) {
		struct xfs_perag	*pag;

		pag = xfs_perag_get(mp, id.agno);
		error = xfs_ag_resv_free(pag);
		xfs_perag_put(pag);
		if (error)
			return error;
	}

	/*
	 * Reserve AG metadata blocks. ENOSPC here does not mean there was a
	 * growfs failure, just that there still isn't space for new user data
	 * after the grow has been run.
	 */
	error = xfs_fs_reserve_ag_blocks(mp);
	if (error == -ENOSPC)
		error = 0;
	return error;

out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}

/*
 * Validate a requested change to the log geometry.  Actually moving or
 * resizing the log is unimplemented, so valid requests still fail with
 * -ENOSYS.
 */
static int
xfs_growfs_log_private(
	xfs_mount_t		*mp,	/* mount point for filesystem */
	xfs_growfs_log_t	*in)	/* growfs log input struct */
{
	xfs_extlen_t		nblocks = in->newblocks;

	/* Reject logs below either filesystem minimum size. */
	if (nblocks < XFS_MIN_LOG_BLOCKS)
		return -EINVAL;
	if (nblocks < XFS_B_TO_FSB(mp, XFS_MIN_LOG_BYTES))
		return -EINVAL;

	/* A no-op request (same size and same placement) is also invalid. */
	if (nblocks == mp->m_sb.sb_logblocks &&
	    in->isint == (mp->m_sb.sb_logstart != 0))
		return -EINVAL;

	/*
	 * Moving the log is hard, need new interfaces to sync
	 * the log first, hold off all activity while moving it.
	 * Can have shorter or longer log in the same space,
	 * or transform internal to external log or vice versa.
	 */
	return -ENOSYS;
}

/*
 * Transactionally update the maximum-inode-percentage superblock field.
 * Returns 0 or a negative errno.
 */
static int
xfs_growfs_imaxpct(
	struct xfs_mount	*mp,
	__u32			imaxpct)
{
	struct xfs_trans	*tp;
	int			error;

	/* imaxpct is a percentage; anything above 100 is nonsense. */
	if (imaxpct > 100)
		return -EINVAL;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
			XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
	if (error)
		return error;

	/* Log the signed delta from the current in-core value. */
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT,
			 (int)imaxpct - mp->m_sb.sb_imax_pct);
	xfs_trans_set_sync(tp);
	return xfs_trans_commit(tp);
}

L
Linus Torvalds 已提交
214 215 216 217 218 219 220
/*
 * protected versions of growfs function acquire and release locks on the mount
 * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG,
 * XFS_IOC_FSGROWFSRT
 */
int
xfs_growfs_data(
221 222
	struct xfs_mount	*mp,
	struct xfs_growfs_data	*in)
L
Linus Torvalds 已提交
223
{
224
	int			error = 0;
225 226

	if (!capable(CAP_SYS_ADMIN))
D
Dave Chinner 已提交
227
		return -EPERM;
228
	if (!mutex_trylock(&mp->m_growlock))
D
Dave Chinner 已提交
229
		return -EWOULDBLOCK;
230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247

	/* update imaxpct separately to the physical grow of the filesystem */
	if (in->imaxpct != mp->m_sb.sb_imax_pct) {
		error = xfs_growfs_imaxpct(mp, in->imaxpct);
		if (error)
			goto out_error;
	}

	if (in->newblocks != mp->m_sb.sb_dblocks) {
		error = xfs_growfs_data_private(mp, in);
		if (error)
			goto out_error;
	}

	/* Post growfs calculations needed to reflect new state in operations */
	if (mp->m_sb.sb_imax_pct) {
		uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct;
		do_div(icount, 100);
D
Darrick J. Wong 已提交
248
		M_IGEO(mp)->maxicount = XFS_FSB_TO_INO(mp, icount);
249
	} else
D
Darrick J. Wong 已提交
250
		M_IGEO(mp)->maxicount = 0;
251

252
	/* Update secondary superblocks now the physical grow has completed */
D
Dave Chinner 已提交
253
	error = xfs_update_secondary_sbs(mp);
254

255
out_error:
256 257 258 259 260 261
	/*
	 * Increment the generation unconditionally, the error could be from
	 * updating the secondary superblocks, in which case the new size
	 * is live already.
	 */
	mp->m_generation++;
262
	mutex_unlock(&mp->m_growlock);
L
Linus Torvalds 已提交
263 264 265 266 267 268 269 270 271
	return error;
}

/*
 * Grow the log section of the filesystem.  Serialized against other grow
 * operations via m_growlock; returns 0 or a negative errno.
 */
int
xfs_growfs_log(
	xfs_mount_t		*mp,
	xfs_growfs_log_t	*in)
{
	int error;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	/* Only one grow operation at a time; don't block waiting for it. */
	if (!mutex_trylock(&mp->m_growlock))
		return -EWOULDBLOCK;
	error = xfs_growfs_log_private(mp, in);
	mutex_unlock(&mp->m_growlock);
	return error;
}

/*
 * exported through ioctl XFS_IOC_FSCOUNTS
 */

286
void
L
Linus Torvalds 已提交
287 288 289 290
xfs_fs_counts(
	xfs_mount_t		*mp,
	xfs_fsop_counts_t	*cnt)
{
291
	cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
292
	cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
293
	cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
294
						mp->m_alloc_set_aside;
295

E
Eric Sandeen 已提交
296
	spin_lock(&mp->m_sb_lock);
L
Linus Torvalds 已提交
297
	cnt->freertx = mp->m_sb.sb_frextents;
E
Eric Sandeen 已提交
298
	spin_unlock(&mp->m_sb_lock);
L
Linus Torvalds 已提交
299 300 301 302 303 304 305
}

/*
 * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
 *
 * xfs_reserve_blocks is called to set m_resblks
 * in the in-core mount table. The number of unused reserved blocks
306
 * is kept in m_resblks_avail.
L
Linus Torvalds 已提交
307 308 309 310 311 312 313 314 315 316 317 318
 *
 * Reserve the requested number of blocks if available. Otherwise return
 * as many as possible to satisfy the request. The actual number
 * reserved are returned in outval
 *
 * A null inval pointer indicates that only the current reserved blocks
 * available  should  be returned no settings are changed.
 */

int
xfs_reserve_blocks(
	xfs_mount_t             *mp,
319
	uint64_t              *inval,
L
Linus Torvalds 已提交
320 321
	xfs_fsop_resblks_t      *outval)
{
322 323 324 325
	int64_t			lcounter, delta;
	int64_t			fdblks_delta = 0;
	uint64_t		request;
	int64_t			free;
326
	int			error = 0;
L
Linus Torvalds 已提交
327 328

	/* If inval is null, report current values and return */
329
	if (inval == (uint64_t *)NULL) {
330
		if (!outval)
D
Dave Chinner 已提交
331
			return -EINVAL;
L
Linus Torvalds 已提交
332 333
		outval->resblks = mp->m_resblks;
		outval->resblks_avail = mp->m_resblks_avail;
334
		return 0;
L
Linus Torvalds 已提交
335 336 337
	}

	request = *inval;
338 339

	/*
340 341 342
	 * With per-cpu counters, this becomes an interesting problem. we need
	 * to work out if we are freeing or allocation blocks first, then we can
	 * do the modification as necessary.
343
	 *
344 345 346 347
	 * We do this under the m_sb_lock so that if we are near ENOSPC, we will
	 * hold out any changes while we work out what to do. This means that
	 * the amount of free space can change while we do this, so we need to
	 * retry if we end up trying to reserve more space than is available.
348
	 */
E
Eric Sandeen 已提交
349
	spin_lock(&mp->m_sb_lock);
L
Linus Torvalds 已提交
350 351 352

	/*
	 * If our previous reservation was larger than the current value,
353 354 355
	 * then move any unused blocks back to the free pool. Modify the resblks
	 * counters directly since we shouldn't have any problems unreserving
	 * space.
L
Linus Torvalds 已提交
356 357 358 359
	 */
	if (mp->m_resblks > request) {
		lcounter = mp->m_resblks_avail - request;
		if (lcounter  > 0) {		/* release unused blocks */
360
			fdblks_delta = lcounter;
L
Linus Torvalds 已提交
361 362 363
			mp->m_resblks_avail -= lcounter;
		}
		mp->m_resblks = request;
364 365 366 367 368 369 370 371
		if (fdblks_delta) {
			spin_unlock(&mp->m_sb_lock);
			error = xfs_mod_fdblocks(mp, fdblks_delta, 0);
			spin_lock(&mp->m_sb_lock);
		}

		goto out;
	}
372

373 374 375 376
	/*
	 * If the request is larger than the current reservation, reserve the
	 * blocks before we update the reserve counters. Sample m_fdblocks and
	 * perform a partial reservation if the request exceeds free space.
377 378 379 380
	 *
	 * The code below estimates how many blocks it can request from
	 * fdblocks to stash in the reserve pool.  This is a classic TOCTOU
	 * race since fdblocks updates are not always coordinated via
381 382 383
	 * m_sb_lock.  Set the reserve size even if there's not enough free
	 * space to fill it because mod_fdblocks will refill an undersized
	 * reserve when it can.
384
	 */
385 386 387
	free = percpu_counter_sum(&mp->m_fdblocks) -
						xfs_fdblocks_unavailable(mp);
	delta = request - mp->m_resblks;
388
	mp->m_resblks = request;
389
	if (delta > 0 && free > 0) {
390
		/*
391
		 * We'll either succeed in getting space from the free block
392 393 394
		 * count or we'll get an ENOSPC.  Don't set the reserved flag
		 * here - we don't want to reserve the extra reserve blocks
		 * from the reserve.
395
		 */
396
		fdblks_delta = min(free, delta);
397 398 399 400
		spin_unlock(&mp->m_sb_lock);
		error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
		spin_lock(&mp->m_sb_lock);

401 402 403 404
		/*
		 * Update the reserve counters if blocks have been successfully
		 * allocated.
		 */
405
		if (!error)
406
			mp->m_resblks_avail += fdblks_delta;
407
	}
408 409 410 411 412 413 414 415
out:
	if (outval) {
		outval->resblks = mp->m_resblks;
		outval->resblks_avail = mp->m_resblks_avail;
	}

	spin_unlock(&mp->m_sb_lock);
	return error;
L
Linus Torvalds 已提交
416 417 418 419 420
}

int
xfs_fs_goingdown(
	xfs_mount_t	*mp,
421
	uint32_t	inflags)
L
Linus Torvalds 已提交
422 423 424
{
	switch (inflags) {
	case XFS_FSOP_GOING_FLAGS_DEFAULT: {
C
Christoph Hellwig 已提交
425
		struct super_block *sb = freeze_bdev(mp->m_super->s_bdev);
L
Linus Torvalds 已提交
426

427
		if (sb && !IS_ERR(sb)) {
428
			xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
L
Linus Torvalds 已提交
429 430
			thaw_bdev(sb->s_bdev, sb);
		}
431

L
Linus Torvalds 已提交
432 433 434
		break;
	}
	case XFS_FSOP_GOING_FLAGS_LOGFLUSH:
435
		xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
L
Linus Torvalds 已提交
436 437
		break;
	case XFS_FSOP_GOING_FLAGS_NOLOGFLUSH:
438 439
		xfs_force_shutdown(mp,
				SHUTDOWN_FORCE_UMOUNT | SHUTDOWN_LOG_IO_ERROR);
L
Linus Torvalds 已提交
440 441
		break;
	default:
D
Dave Chinner 已提交
442
		return -EINVAL;
L
Linus Torvalds 已提交
443 444 445 446
	}

	return 0;
}
447 448 449 450 451 452

/*
 * Force a shutdown of the filesystem instantly while keeping the filesystem
 * consistent. We don't do an unmount here; just shutdown the shop, make sure
 * that absolutely nothing persistent happens to this filesystem after this
 * point.
453 454 455 456 457
 *
 * The shutdown state change is atomic, resulting in the first and only the
 * first shutdown call processing the shutdown. This means we only shutdown the
 * log once as it requires, and we don't spam the logs when multiple concurrent
 * shutdowns race to set the shutdown flags.
458 459 460
 */
void
xfs_do_force_shutdown(
461
	struct xfs_mount *mp,
462 463 464 465
	int		flags,
	char		*fname,
	int		lnnum)
{
466 467
	int		tag;
	const char	*why;
468

469 470 471
	spin_lock(&mp->m_sb_lock);
	if (XFS_FORCED_SHUTDOWN(mp)) {
		spin_unlock(&mp->m_sb_lock);
472 473
		return;
	}
474 475 476 477 478 479 480
	mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
	if (mp->m_sb_bp)
		mp->m_sb_bp->b_flags |= XBF_DONE;
	spin_unlock(&mp->m_sb_lock);

	if (flags & SHUTDOWN_FORCE_UMOUNT)
		xfs_alert(mp, "User initiated shutdown received.");
481

482 483 484 485 486 487
	if (xlog_force_shutdown(mp->m_log, flags)) {
		tag = XFS_PTAG_SHUTDOWN_LOGERROR;
		why = "Log I/O Error";
	} else if (flags & SHUTDOWN_CORRUPT_INCORE) {
		tag = XFS_PTAG_SHUTDOWN_CORRUPT;
		why = "Corruption of in-memory data";
B
Brian Foster 已提交
488
	} else {
489 490
		tag = XFS_PTAG_SHUTDOWN_IOERROR;
		why = "Metadata I/O Error";
491
	}
492

493 494 495
	xfs_alert_tag(mp, tag,
"%s (0x%x) detected at %pS (%s:%d).  Shutting down filesystem.",
			why, flags, __return_address, fname, lnnum);
496 497
	xfs_alert(mp,
		"Please unmount the filesystem and rectify the problem(s)");
498 499
	if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
		xfs_stack_trace();
500
}
501 502 503 504 505 506 507 508 509 510 511 512 513

/*
 * Reserve free space for per-AG metadata.
 */
int
xfs_fs_reserve_ag_blocks(
	struct xfs_mount	*mp)
{
	xfs_agnumber_t		agno;
	struct xfs_perag	*pag;
	int			error = 0;
	int			err2;

514
	mp->m_finobt_nores = false;
515 516
	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		pag = xfs_perag_get(mp, agno);
517
		err2 = xfs_ag_resv_init(pag, NULL);
518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557
		xfs_perag_put(pag);
		if (err2 && !error)
			error = err2;
	}

	if (error && error != -ENOSPC) {
		xfs_warn(mp,
	"Error %d reserving per-AG metadata reserve pool.", error);
		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
	}

	return error;
}

/*
 * Free space reserved for per-AG metadata.
 */
int
xfs_fs_unreserve_ag_blocks(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;
	int			first_err = 0;

	/* Release every AG's reservation, remembering the first failure. */
	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		int		err;

		pag = xfs_perag_get(mp, agno);
		err = xfs_ag_resv_free(pag);
		xfs_perag_put(pag);
		if (err && !first_err)
			first_err = err;
	}

	if (first_err)
		xfs_warn(mp,
	"Error %d freeing per-AG metadata reserve pool.", first_err);

	return first_err;
}