super.c 67.6 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
C
Chris Mason 已提交
2 3 4 5
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

Y
Yan 已提交
6
#include <linux/blkdev.h>
7 8 9 10 11 12
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
E
Eric Paris 已提交
13
#include <linux/seq_file.h>
14 15
#include <linux/string.h>
#include <linux/backing-dev.h>
Y
Yan 已提交
16
#include <linux/mount.h>
C
Chris Mason 已提交
17
#include <linux/writeback.h>
C
Chris Mason 已提交
18
#include <linux/statfs.h>
C
Chris Mason 已提交
19
#include <linux/compat.h>
20
#include <linux/parser.h>
21
#include <linux/ctype.h>
22
#include <linux/namei.h>
23
#include <linux/miscdevice.h>
24
#include <linux/magic.h>
25
#include <linux/slab.h>
D
Dan Magenheimer 已提交
26
#include <linux/cleancache.h>
27
#include <linux/ratelimit.h>
28
#include <linux/crc32c.h>
29
#include <linux/btrfs.h>
30
#include "delayed-inode.h"
31
#include "ctree.h"
C
Chris Mason 已提交
32
#include "disk-io.h"
33
#include "transaction.h"
C
Chris Mason 已提交
34
#include "btrfs_inode.h"
C
Chris Mason 已提交
35
#include "print-tree.h"
36
#include "props.h"
J
Josef Bacik 已提交
37
#include "xattr.h"
38
#include "volumes.h"
B
Balaji Rao 已提交
39
#include "export.h"
C
Chris Mason 已提交
40
#include "compression.h"
J
Josef Bacik 已提交
41
#include "rcu-string.h"
42
#include "dev-replace.h"
43
#include "free-space-cache.h"
44
#include "backref.h"
45
#include "space-info.h"
46
#include "sysfs.h"
47
#include "tests/btrfs-tests.h"
48
#include "block-group.h"
49
#include "discard.h"
50

51
#include "qgroup.h"
52 53 54
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>

55
static const struct super_operations btrfs_super_ops;
56 57 58 59 60 61

/*
 * Types for mounting the default subvolume and a subvolume explicitly
 * requested by subvol=/path. That way the callchain is straightforward and we
 * don't have to play tricks with the mount options and recursive calls to
 * btrfs_mount.
 *
 * The new btrfs_root_fs_type also serves as a tag for the bdev_holder.
 */
65
static struct file_system_type btrfs_fs_type;
66
static struct file_system_type btrfs_root_fs_type;
C
Chris Mason 已提交
67

68 69
static int btrfs_remount(struct super_block *sb, int *flags, char *data);

D
David Sterba 已提交
70
/*
 * Translate a negative errno value into a short human-readable description
 * for use in log messages.
 *
 * @errno: negative error code (e.g. -EIO)
 *
 * Returns a pointer to a constant string. Codes without a dedicated message
 * (including 0 and positive values) yield "unknown".
 */
const char * __attribute_const__ btrfs_decode_error(int errno)
{
	switch (errno) {
	case -ENOENT:		/* -2 */
		return "No such entry";
	case -EIO:		/* -5 */
		return "IO failure";
	case -ENOMEM:		/* -12 */
		return "Out of memory";
	case -EEXIST:		/* -17 */
		return "Object already exists";
	case -ENOSPC:		/* -28 */
		return "No space left";
	case -EROFS:		/* -30 */
		return "Readonly filesystem";
	case -EOPNOTSUPP:	/* -95 */
		return "Operation not supported";
	case -EUCLEAN:		/* -117 */
		return "Filesystem corrupted";
	case -EDQUOT:		/* -122 */
		return "Quota exceeded";
	default:
		return "unknown";
	}
}

/*
108
 * __btrfs_handle_fs_error decodes expected errors from the caller and
109
 * invokes the appropriate error response.
L
liubo 已提交
110
 */
111
__cold
112
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
J
Jeff Mahoney 已提交
113
		       unsigned int line, int errno, const char *fmt, ...)
L
liubo 已提交
114 115
{
	struct super_block *sb = fs_info->sb;
116
#ifdef CONFIG_PRINTK
L
liubo 已提交
117
	const char *errstr;
118
#endif
L
liubo 已提交
119 120 121

	/*
	 * Special case: if the error is EROFS, and we're already
122
	 * under SB_RDONLY, then it is safe here.
L
liubo 已提交
123
	 */
124
	if (errno == -EROFS && sb_rdonly(sb))
J
Jeff Mahoney 已提交
125 126
  		return;

127
#ifdef CONFIG_PRINTK
128
	errstr = btrfs_decode_error(errno);
J
Jeff Mahoney 已提交
129
	if (fmt) {
130 131 132 133 134 135
		struct va_format vaf;
		va_list args;

		va_start(args, fmt);
		vaf.fmt = fmt;
		vaf.va = &args;
J
Jeff Mahoney 已提交
136

137
		pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s (%pV)\n",
138
			sb->s_id, function, line, errno, errstr, &vaf);
139
		va_end(args);
J
Jeff Mahoney 已提交
140
	} else {
141
		pr_crit("BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
142
			sb->s_id, function, line, errno, errstr);
J
Jeff Mahoney 已提交
143
	}
144
#endif
L
liubo 已提交
145

A
Anand Jain 已提交
146 147 148 149 150 151
	/*
	 * Today we only save the error info to memory.  Long term we'll
	 * also send it down to the disk
	 */
	set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);

J
Jeff Mahoney 已提交
152
	/* Don't go through full error handling during mount */
153 154 155 156 157 158
	if (!(sb->s_flags & SB_BORN))
		return;

	if (sb_rdonly(sb))
		return;

159 160
	btrfs_discard_stop(fs_info);

161 162 163 164 165 166 167 168
	/* btrfs handle error by forcing the filesystem readonly */
	sb->s_flags |= SB_RDONLY;
	btrfs_info(fs_info, "forced readonly");
	/*
	 * Note that a running device replace operation is not canceled here
	 * although there is no way to update the progress. It would add the
	 * risk of a deadlock, therefore the canceling is omitted. The only
	 * penalty is that some I/O remains active until the procedure
169
	 * completes. The next time when the filesystem is mounted writable
170 171
	 * again, the device replace operation continues.
	 */
J
Jeff Mahoney 已提交
172
}
L
liubo 已提交
173

174
#ifdef CONFIG_PRINTK
175
/*
 * Human-readable names of the kernel log levels, indexed by the numeric
 * level (0-7).  btrfs_printk() looks the name up with kern_level - '0'.
 */
static const char * const logtypes[] = {
	[0] = "emergency",
	[1] = "alert",
	[2] = "critical",
	[3] = "error",
	[4] = "warning",
	[5] = "notice",
	[6] = "info",
	[7] = "debug",
};

186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201

/*
 * Use one ratelimit state per log level so that a flood of less important
 * messages doesn't cause more important ones to be dropped.
 *
 * The array has one entry per numeric kernel log level (0-7), in the same
 * order as logtypes[]; btrfs_printk() picks the entry with kern_level - '0'.
 * Every entry is initialized with DEFAULT_RATELIMIT_INTERVAL and 100 as the
 * last RATELIMIT_STATE_INIT argument (presumably the burst count -- confirm
 * against the ratelimit header).
 */
static struct ratelimit_state printk_limits[] = {
	RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100),
	RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100),
	RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100),
	RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100),
	RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100),
	RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100),
	RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100),
	RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
};

202
/*
 * Print a btrfs log message of the form
 * "<level>BTRFS <type> (device <id>): <message>".
 *
 * @fs_info: filesystem the message belongs to; may be NULL, in which case
 *           the device is printed as "<unknown>"
 * @fmt:     printf-style format, optionally starting with one or more printk
 *           level headers
 *
 * Every leading level header is consumed.  A numeric level '0'..'7' selects
 * the printed type name and the per-level ratelimit state (the last numeric
 * header wins); other headers are only skipped.  Without a numeric header the
 * message is treated as level 4 ("warning").  The message is dropped when the
 * selected ratelimit state rejects it.
 */
void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
	char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
	struct va_format vaf;
	va_list args;
	int kern_level;
	const char *type = logtypes[4];
	struct ratelimit_state *ratelimit = &printk_limits[4];

	va_start(args, fmt);

	while ((kern_level = printk_get_level(fmt)) != 0) {
		size_t size = printk_skip_level(fmt) - fmt;

		if (kern_level >= '0' && kern_level <= '7') {
			/* Keep the header so it can be re-emitted in front */
			memcpy(lvl, fmt,  size);
			lvl[size] = '\0';
			type = logtypes[kern_level - '0'];
			ratelimit = &printk_limits[kern_level - '0'];
		}
		fmt += size;
	}

	vaf.fmt = fmt;
	vaf.va = &args;

	if (__ratelimit(ratelimit))
		printk("%sBTRFS %s (device %s): %pV\n", lvl, type,
			fs_info ? fs_info->sb->s_id : "<unknown>", &vaf);

	va_end(args);
}
#endif
L
liubo 已提交
235

236 237 238 239 240 241 242 243 244 245 246 247 248
/*
 * We only mark the transaction aborted and then set the file system read-only.
 * This will prevent new transactions from starting or trying to join this
 * one.
 *
 * This means that error recovery at the call site is limited to freeing
 * any local memory allocations and passing the error code up without
 * further cleanup. The transaction should complete as it normally would
 * in the call path but will return -EIO.
 *
 * We'll complete the cleanup in btrfs_end_transaction and
 * btrfs_commit_transaction.
 */
249
__cold
250
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
251
			       const char *function,
252 253
			       unsigned int line, int errno)
{
254 255
	struct btrfs_fs_info *fs_info = trans->fs_info;

256
	WRITE_ONCE(trans->aborted, errno);
257 258
	/* Nothing used. The other threads that have joined this
	 * transaction may be able to continue. */
259
	if (!trans->dirty && list_empty(&trans->new_bgs)) {
260 261
		const char *errstr;

262
		errstr = btrfs_decode_error(errno);
263
		btrfs_warn(fs_info,
264 265
		           "%s:%d: Aborting unused transaction(%s).",
		           function, line, errstr);
L
liubo 已提交
266
		return;
267
	}
S
Seraphime Kirkovski 已提交
268
	WRITE_ONCE(trans->transaction->aborted, errno);
269
	/* Wake up anybody who may be waiting on this transaction */
270 271 272
	wake_up(&fs_info->transaction_wait);
	wake_up(&fs_info->transaction_blocked_wait);
	__btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
273
}
J
Jeff Mahoney 已提交
274 275 276 277
/*
 * __btrfs_panic decodes unexpected, fatal errors from the caller,
 * issues an alert, and either panics or BUGs, depending on mount options.
 */
278
__cold
J
Jeff Mahoney 已提交
279 280 281 282 283 284 285
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
		   unsigned int line, int errno, const char *fmt, ...)
{
	char *s_id = "<unknown>";
	const char *errstr;
	struct va_format vaf = { .fmt = fmt };
	va_list args;
L
liubo 已提交
286

J
Jeff Mahoney 已提交
287 288
	if (fs_info)
		s_id = fs_info->sb->s_id;
L
liubo 已提交
289

J
Jeff Mahoney 已提交
290 291 292
	va_start(args, fmt);
	vaf.va = &args;

293
	errstr = btrfs_decode_error(errno);
294
	if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
295 296
		panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
			s_id, function, line, &vaf, errno, errstr);
J
Jeff Mahoney 已提交
297

298 299
	btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
		   function, line, &vaf, errno, errstr);
J
Jeff Mahoney 已提交
300 301
	va_end(args);
	/* Caller calls BUG() */
L
liubo 已提交
302 303
}

C
Chris Mason 已提交
304
/*
 * Superblock teardown hook: hands the btrfs_fs_info attached to @sb to
 * close_ctree().
 */
static void btrfs_put_super(struct super_block *sb)
{
	close_ctree(btrfs_sb(sb));
}

309
/*
 * Token identifiers for the mount options recognized by the parsers in this
 * file.  The values are returned by match_token() for the patterns listed in
 * the tokens[] and rescue_tokens[] tables; Opt_err marks an unrecognized
 * option.
 */
enum {
	Opt_acl, Opt_noacl,
	Opt_clear_cache,
	Opt_commit_interval,
	Opt_compress,
	Opt_compress_force,
	Opt_compress_force_type,
	Opt_compress_type,
	Opt_degraded,
	Opt_device,
	Opt_fatal_errors,
	Opt_flushoncommit, Opt_noflushoncommit,
	Opt_inode_cache, Opt_noinode_cache,
	Opt_max_inline,
	Opt_barrier, Opt_nobarrier,
	Opt_datacow, Opt_nodatacow,
	Opt_datasum, Opt_nodatasum,
	Opt_defrag, Opt_nodefrag,
	Opt_discard, Opt_nodiscard,
	Opt_discard_mode,
	Opt_norecovery,
	Opt_ratio,
	Opt_rescan_uuid_tree,
	Opt_skip_balance,
	Opt_space_cache, Opt_no_space_cache,
	Opt_space_cache_version,
	Opt_ssd, Opt_nossd,
	Opt_ssd_spread, Opt_nossd_spread,
	Opt_subvol,
	Opt_subvol_empty,
	Opt_subvolid,
	Opt_thread_pool,
	Opt_treelog, Opt_notreelog,
	Opt_user_subvol_rm_allowed,

	/* Rescue options */
	Opt_rescue,
	Opt_usebackuproot,
	Opt_nologreplay,

	/* Deprecated options */
	Opt_recovery,

	/* Debugging options */
	Opt_check_integrity,
	Opt_check_integrity_including_extent_data,
	Opt_check_integrity_print_mask,
	Opt_enospc_debug, Opt_noenospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
	Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
	Opt_ref_verify,
#endif
	Opt_err,
};

D
David Sterba 已提交
366
static const match_table_t tokens = {
367 368 369 370
	{Opt_acl, "acl"},
	{Opt_noacl, "noacl"},
	{Opt_clear_cache, "clear_cache"},
	{Opt_commit_interval, "commit=%u"},
C
Chris Mason 已提交
371
	{Opt_compress, "compress"},
372
	{Opt_compress_type, "compress=%s"},
C
Chris Mason 已提交
373
	{Opt_compress_force, "compress-force"},
374
	{Opt_compress_force_type, "compress-force=%s"},
375 376 377
	{Opt_degraded, "degraded"},
	{Opt_device, "device=%s"},
	{Opt_fatal_errors, "fatal_errors=%s"},
378
	{Opt_flushoncommit, "flushoncommit"},
379
	{Opt_noflushoncommit, "noflushoncommit"},
380 381 382 383 384 385 386 387 388 389 390
	{Opt_inode_cache, "inode_cache"},
	{Opt_noinode_cache, "noinode_cache"},
	{Opt_max_inline, "max_inline=%s"},
	{Opt_barrier, "barrier"},
	{Opt_nobarrier, "nobarrier"},
	{Opt_datacow, "datacow"},
	{Opt_nodatacow, "nodatacow"},
	{Opt_datasum, "datasum"},
	{Opt_nodatasum, "nodatasum"},
	{Opt_defrag, "autodefrag"},
	{Opt_nodefrag, "noautodefrag"},
C
Christoph Hellwig 已提交
391
	{Opt_discard, "discard"},
392
	{Opt_discard_mode, "discard=%s"},
Q
Qu Wenruo 已提交
393
	{Opt_nodiscard, "nodiscard"},
394 395 396 397
	{Opt_norecovery, "norecovery"},
	{Opt_ratio, "metadata_ratio=%u"},
	{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
	{Opt_skip_balance, "skip_balance"},
398
	{Opt_space_cache, "space_cache"},
399
	{Opt_no_space_cache, "nospace_cache"},
400 401 402 403 404 405
	{Opt_space_cache_version, "space_cache=%s"},
	{Opt_ssd, "ssd"},
	{Opt_nossd, "nossd"},
	{Opt_ssd_spread, "ssd_spread"},
	{Opt_nossd_spread, "nossd_spread"},
	{Opt_subvol, "subvol=%s"},
O
Omar Sandoval 已提交
406
	{Opt_subvol_empty, "subvol="},
407 408 409 410 411 412
	{Opt_subvolid, "subvolid=%s"},
	{Opt_thread_pool, "thread_pool=%u"},
	{Opt_treelog, "treelog"},
	{Opt_notreelog, "notreelog"},
	{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},

413 414 415 416 417 418 419
	/* Rescue options */
	{Opt_rescue, "rescue=%s"},
	/* Deprecated, with alias rescue=nologreplay */
	{Opt_nologreplay, "nologreplay"},
	/* Deprecated, with alias rescue=usebackuproot */
	{Opt_usebackuproot, "usebackuproot"},

420 421 422 423
	/* Deprecated options */
	{Opt_recovery, "recovery"},

	/* Debugging options */
424 425
	{Opt_check_integrity, "check_int"},
	{Opt_check_integrity_including_extent_data, "check_int_data"},
426
	{Opt_check_integrity_print_mask, "check_int_print_mask=%u"},
427 428
	{Opt_enospc_debug, "enospc_debug"},
	{Opt_noenospc_debug, "noenospc_debug"},
429 430 431 432
#ifdef CONFIG_BTRFS_DEBUG
	{Opt_fragment_data, "fragment=data"},
	{Opt_fragment_metadata, "fragment=metadata"},
	{Opt_fragment_all, "fragment=all"},
J
Josef Bacik 已提交
433 434 435
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
	{Opt_ref_verify, "ref_verify"},
436
#endif
J
Josef Bacik 已提交
437
	{Opt_err, NULL},
438 439
};

440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488
static const match_table_t rescue_tokens = {
	{Opt_usebackuproot, "usebackuproot"},
	{Opt_nologreplay, "nologreplay"},
	{Opt_err, NULL},
};

/*
 * Parse the value of the "rescue=" mount option.
 *
 * @info:    filesystem whose mount options are updated
 * @options: colon-separated list of rescue sub-options; left unmodified
 *
 * Empty list items are skipped.  Returns 0 on success, -ENOMEM if the
 * working copy of @options cannot be allocated, or -EINVAL at the first
 * unrecognized sub-option (options seen before it stay applied).
 */
static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
{
	substring_t args[MAX_OPT_ARGS];
	char *buf, *cursor, *item;
	int ret = 0;

	/* strsep() writes into the string it walks, so work on a copy. */
	buf = kstrdup(options, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	for (cursor = buf; (item = strsep(&cursor, ":")) != NULL; ) {
		if (*item == '\0')
			continue;

		switch (match_token(item, rescue_tokens, args)) {
		case Opt_usebackuproot:
			btrfs_info(info,
				   "trying to use backup root at mount time");
			btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
			break;
		case Opt_nologreplay:
			btrfs_set_and_info(info, NOLOGREPLAY,
					   "disabling log replay at mount time");
			break;
		case Opt_err:
			btrfs_info(info, "unrecognized rescue option '%s'", item);
			ret = -EINVAL;
			goto out;
		default:
			break;
		}
	}
out:
	kfree(buf);
	return ret;
}

489 490 491
/*
 * Regular mount options parser.  Everything that is needed only when
 * reading in a new superblock is parsed here.
492
 * XXX JDM: This needs to be cleaned up for remount.
493
 */
494
int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
495
			unsigned long new_flags)
496 497
{
	substring_t args[MAX_OPT_ARGS];
498
	char *p, *num;
499
	u64 cache_gen;
500
	int intarg;
S
Sage Weil 已提交
501
	int ret = 0;
502 503
	char *compress_type;
	bool compress_force = false;
504 505 506
	enum btrfs_compression_type saved_compress_type;
	bool saved_compress_force;
	int no_compress = 0;
507

508 509
	cache_gen = btrfs_super_cache_generation(info->super_copy);
	if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
510 511
		btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
	else if (cache_gen)
512 513
		btrfs_set_opt(info->mount_opt, SPACE_CACHE);

514 515 516 517
	/*
	 * Even the options are empty, we still need to do extra check
	 * against new flags
	 */
518
	if (!options)
519
		goto check;
520

521
	while ((p = strsep(&options, ",")) != NULL) {
522 523 524 525 526 527
		int token;
		if (!*p)
			continue;

		token = match_token(p, tokens, args);
		switch (token) {
528
		case Opt_degraded:
529
			btrfs_info(info, "allowing degraded mounts");
530
			btrfs_set_opt(info->mount_opt, DEGRADED);
531
			break;
532
		case Opt_subvol:
O
Omar Sandoval 已提交
533
		case Opt_subvol_empty:
534
		case Opt_subvolid:
535
		case Opt_device:
536
			/*
537 538
			 * These are parsed by btrfs_parse_subvol_options or
			 * btrfs_parse_device_options and can be ignored here.
539
			 */
540 541
			break;
		case Opt_nodatasum:
542
			btrfs_set_and_info(info, NODATASUM,
543
					   "setting nodatasum");
544
			break;
Q
Qu Wenruo 已提交
545
		case Opt_datasum:
546 547
			if (btrfs_test_opt(info, NODATASUM)) {
				if (btrfs_test_opt(info, NODATACOW))
548
					btrfs_info(info,
J
Jeff Mahoney 已提交
549
						   "setting datasum, datacow enabled");
550
				else
551
					btrfs_info(info, "setting datasum");
552
			}
Q
Qu Wenruo 已提交
553 554 555
			btrfs_clear_opt(info->mount_opt, NODATACOW);
			btrfs_clear_opt(info->mount_opt, NODATASUM);
			break;
556
		case Opt_nodatacow:
557 558 559
			if (!btrfs_test_opt(info, NODATACOW)) {
				if (!btrfs_test_opt(info, COMPRESS) ||
				    !btrfs_test_opt(info, FORCE_COMPRESS)) {
560
					btrfs_info(info,
561 562
						   "setting nodatacow, compression disabled");
				} else {
563
					btrfs_info(info, "setting nodatacow");
564
				}
565 566 567
			}
			btrfs_clear_opt(info->mount_opt, COMPRESS);
			btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
568 569
			btrfs_set_opt(info->mount_opt, NODATACOW);
			btrfs_set_opt(info->mount_opt, NODATASUM);
570
			break;
Q
Qu Wenruo 已提交
571
		case Opt_datacow:
572
			btrfs_clear_and_info(info, NODATACOW,
573
					     "setting datacow");
Q
Qu Wenruo 已提交
574
			break;
C
Chris Mason 已提交
575
		case Opt_compress_force:
576 577
		case Opt_compress_force_type:
			compress_force = true;
578
			fallthrough;
579 580
		case Opt_compress:
		case Opt_compress_type:
581 582
			saved_compress_type = btrfs_test_opt(info,
							     COMPRESS) ?
583 584
				info->compress_type : BTRFS_COMPRESS_NONE;
			saved_compress_force =
585
				btrfs_test_opt(info, FORCE_COMPRESS);
586 587
			if (token == Opt_compress ||
			    token == Opt_compress_force ||
588
			    strncmp(args[0].from, "zlib", 4) == 0) {
589
				compress_type = "zlib";
590

591
				info->compress_type = BTRFS_COMPRESS_ZLIB;
592 593 594 595 596 597 598 599 600
				info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
				/*
				 * args[0] contains uninitialized data since
				 * for these tokens we don't expect any
				 * parameter.
				 */
				if (token != Opt_compress &&
				    token != Opt_compress_force)
					info->compress_level =
601 602 603
					  btrfs_compress_str2level(
							BTRFS_COMPRESS_ZLIB,
							args[0].from + 4);
604
				btrfs_set_opt(info->mount_opt, COMPRESS);
605 606
				btrfs_clear_opt(info->mount_opt, NODATACOW);
				btrfs_clear_opt(info->mount_opt, NODATASUM);
607
				no_compress = 0;
608
			} else if (strncmp(args[0].from, "lzo", 3) == 0) {
L
Li Zefan 已提交
609 610
				compress_type = "lzo";
				info->compress_type = BTRFS_COMPRESS_LZO;
611
				btrfs_set_opt(info->mount_opt, COMPRESS);
612 613
				btrfs_clear_opt(info->mount_opt, NODATACOW);
				btrfs_clear_opt(info->mount_opt, NODATASUM);
614
				btrfs_set_fs_incompat(info, COMPRESS_LZO);
615
				no_compress = 0;
616
			} else if (strncmp(args[0].from, "zstd", 4) == 0) {
N
Nick Terrell 已提交
617 618
				compress_type = "zstd";
				info->compress_type = BTRFS_COMPRESS_ZSTD;
619 620 621 622
				info->compress_level =
					btrfs_compress_str2level(
							 BTRFS_COMPRESS_ZSTD,
							 args[0].from + 4);
N
Nick Terrell 已提交
623 624 625 626 627
				btrfs_set_opt(info->mount_opt, COMPRESS);
				btrfs_clear_opt(info->mount_opt, NODATACOW);
				btrfs_clear_opt(info->mount_opt, NODATASUM);
				btrfs_set_fs_incompat(info, COMPRESS_ZSTD);
				no_compress = 0;
628 629 630 631 632
			} else if (strncmp(args[0].from, "no", 2) == 0) {
				compress_type = "no";
				btrfs_clear_opt(info->mount_opt, COMPRESS);
				btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
				compress_force = false;
633
				no_compress++;
634 635 636 637 638 639
			} else {
				ret = -EINVAL;
				goto out;
			}

			if (compress_force) {
640
				btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
641
			} else {
642 643 644 645 646 647 648
				/*
				 * If we remount from compress-force=xxx to
				 * compress=xxx, we need clear FORCE_COMPRESS
				 * flag, otherwise, there is no way for users
				 * to disable forcible compression separately.
				 */
				btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
649
			}
650
			if ((btrfs_test_opt(info, COMPRESS) &&
651 652
			     (info->compress_type != saved_compress_type ||
			      compress_force != saved_compress_force)) ||
653
			    (!btrfs_test_opt(info, COMPRESS) &&
654
			     no_compress == 1)) {
655
				btrfs_info(info, "%s %s compression, level %d",
656
					   (compress_force) ? "force" : "use",
657
					   compress_type, info->compress_level);
658 659
			}
			compress_force = false;
C
Chris Mason 已提交
660
			break;
661
		case Opt_ssd:
662
			btrfs_set_and_info(info, SSD,
663
					   "enabling ssd optimizations");
664
			btrfs_clear_opt(info->mount_opt, NOSSD);
665
			break;
666
		case Opt_ssd_spread:
667 668
			btrfs_set_and_info(info, SSD,
					   "enabling ssd optimizations");
669
			btrfs_set_and_info(info, SSD_SPREAD,
670
					   "using spread ssd allocation scheme");
671
			btrfs_clear_opt(info->mount_opt, NOSSD);
672
			break;
C
Chris Mason 已提交
673
		case Opt_nossd:
674 675 676
			btrfs_set_opt(info->mount_opt, NOSSD);
			btrfs_clear_and_info(info, SSD,
					     "not using ssd optimizations");
677
			fallthrough;
678
		case Opt_nossd_spread:
679 680
			btrfs_clear_and_info(info, SSD_SPREAD,
					     "not using spread ssd allocation scheme");
C
Chris Mason 已提交
681
			break;
682
		case Opt_barrier:
683
			btrfs_clear_and_info(info, NOBARRIER,
684
					     "turning on barriers");
685
			break;
686
		case Opt_nobarrier:
687
			btrfs_set_and_info(info, NOBARRIER,
688
					   "turning off barriers");
689
			break;
690
		case Opt_thread_pool:
691 692 693
			ret = match_int(&args[0], &intarg);
			if (ret) {
				goto out;
694
			} else if (intarg == 0) {
695 696 697
				ret = -EINVAL;
				goto out;
			}
698
			info->thread_pool_size = intarg;
699
			break;
700
		case Opt_max_inline:
701 702
			num = match_strdup(&args[0]);
			if (num) {
A
Akinobu Mita 已提交
703
				info->max_inline = memparse(num, NULL);
704 705
				kfree(num);

C
Chris Mason 已提交
706
				if (info->max_inline) {
707
					info->max_inline = min_t(u64,
C
Chris Mason 已提交
708
						info->max_inline,
709
						info->sectorsize);
C
Chris Mason 已提交
710
				}
711 712
				btrfs_info(info, "max_inline at %llu",
					   info->max_inline);
713 714 715
			} else {
				ret = -ENOMEM;
				goto out;
716 717
			}
			break;
Q
Qu Wenruo 已提交
718
		case Opt_acl:
719
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
720
			info->sb->s_flags |= SB_POSIXACL;
Q
Qu Wenruo 已提交
721
			break;
722
#else
723
			btrfs_err(info, "support for ACL not compiled in!");
724 725 726
			ret = -EINVAL;
			goto out;
#endif
J
Josef Bacik 已提交
727
		case Opt_noacl:
728
			info->sb->s_flags &= ~SB_POSIXACL;
J
Josef Bacik 已提交
729
			break;
S
Sage Weil 已提交
730
		case Opt_notreelog:
731
			btrfs_set_and_info(info, NOTREELOG,
732
					   "disabling tree log");
Q
Qu Wenruo 已提交
733 734
			break;
		case Opt_treelog:
735
			btrfs_clear_and_info(info, NOTREELOG,
736
					     "enabling tree log");
S
Sage Weil 已提交
737
			break;
738
		case Opt_norecovery:
739
		case Opt_nologreplay:
740 741
			btrfs_warn(info,
		"'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
742
			btrfs_set_and_info(info, NOLOGREPLAY,
743 744
					   "disabling log replay at mount time");
			break;
745
		case Opt_flushoncommit:
746
			btrfs_set_and_info(info, FLUSHONCOMMIT,
747
					   "turning on flush-on-commit");
748
			break;
749
		case Opt_noflushoncommit:
750
			btrfs_clear_and_info(info, FLUSHONCOMMIT,
751
					     "turning off flush-on-commit");
752
			break;
753
		case Opt_ratio:
754
			ret = match_int(&args[0], &intarg);
755
			if (ret)
756
				goto out;
757 758 759
			info->metadata_ratio = intarg;
			btrfs_info(info, "metadata ratio %u",
				   info->metadata_ratio);
760
			break;
C
Christoph Hellwig 已提交
761
		case Opt_discard:
762 763 764 765 766 767 768 769 770 771 772 773 774 775
		case Opt_discard_mode:
			if (token == Opt_discard ||
			    strcmp(args[0].from, "sync") == 0) {
				btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC);
				btrfs_set_and_info(info, DISCARD_SYNC,
						   "turning on sync discard");
			} else if (strcmp(args[0].from, "async") == 0) {
				btrfs_clear_opt(info->mount_opt, DISCARD_SYNC);
				btrfs_set_and_info(info, DISCARD_ASYNC,
						   "turning on async discard");
			} else {
				ret = -EINVAL;
				goto out;
			}
C
Christoph Hellwig 已提交
776
			break;
Q
Qu Wenruo 已提交
777
		case Opt_nodiscard:
778
			btrfs_clear_and_info(info, DISCARD_SYNC,
779
					     "turning off discard");
780 781
			btrfs_clear_and_info(info, DISCARD_ASYNC,
					     "turning off async discard");
Q
Qu Wenruo 已提交
782
			break;
783
		case Opt_space_cache:
784 785 786
		case Opt_space_cache_version:
			if (token == Opt_space_cache ||
			    strcmp(args[0].from, "v1") == 0) {
787
				btrfs_clear_opt(info->mount_opt,
788
						FREE_SPACE_TREE);
789
				btrfs_set_and_info(info, SPACE_CACHE,
790
					   "enabling disk space caching");
791
			} else if (strcmp(args[0].from, "v2") == 0) {
792
				btrfs_clear_opt(info->mount_opt,
793
						SPACE_CACHE);
794
				btrfs_set_and_info(info, FREE_SPACE_TREE,
795 796 797 798 799
						   "enabling free space tree");
			} else {
				ret = -EINVAL;
				goto out;
			}
800
			break;
801 802 803
		case Opt_rescan_uuid_tree:
			btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
			break;
804
		case Opt_no_space_cache:
805
			if (btrfs_test_opt(info, SPACE_CACHE)) {
806 807
				btrfs_clear_and_info(info, SPACE_CACHE,
					     "disabling disk space caching");
808
			}
809
			if (btrfs_test_opt(info, FREE_SPACE_TREE)) {
810 811
				btrfs_clear_and_info(info, FREE_SPACE_TREE,
					     "disabling free space tree");
812
			}
813
			break;
C
Chris Mason 已提交
814
		case Opt_inode_cache:
815 816
			btrfs_warn(info,
	"the 'inode_cache' option is deprecated and will have no effect from 5.11");
817
			btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
818
					   "enabling inode map caching");
819 820
			break;
		case Opt_noinode_cache:
821
			btrfs_clear_pending_and_info(info, INODE_MAP_CACHE,
822
					     "disabling inode map caching");
C
Chris Mason 已提交
823
			break;
824
		case Opt_clear_cache:
825
			btrfs_set_and_info(info, CLEAR_CACHE,
826
					   "force clearing of disk cache");
827
			break;
828 829 830
		case Opt_user_subvol_rm_allowed:
			btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
			break;
831 832 833
		case Opt_enospc_debug:
			btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
			break;
834 835 836
		case Opt_noenospc_debug:
			btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
			break;
C
Chris Mason 已提交
837
		case Opt_defrag:
838
			btrfs_set_and_info(info, AUTO_DEFRAG,
839
					   "enabling auto defrag");
C
Chris Mason 已提交
840
			break;
841
		case Opt_nodefrag:
842
			btrfs_clear_and_info(info, AUTO_DEFRAG,
843
					     "disabling auto defrag");
844
			break;
C
Chris Mason 已提交
845
		case Opt_recovery:
846
		case Opt_usebackuproot:
847 848 849 850
			btrfs_warn(info,
			"'%s' is deprecated, use 'rescue=usebackuproot' instead",
				   token == Opt_recovery ? "recovery" :
				   "usebackuproot");
851
			btrfs_info(info,
852 853
				   "trying to use backup root at mount time");
			btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
C
Chris Mason 已提交
854
			break;
855 856 857
		case Opt_skip_balance:
			btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
			break;
858 859
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
		case Opt_check_integrity_including_extent_data:
860
			btrfs_info(info,
861
				   "enabling check integrity including extent data");
862 863 864 865 866
			btrfs_set_opt(info->mount_opt,
				      CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
			btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
			break;
		case Opt_check_integrity:
867
			btrfs_info(info, "enabling check integrity");
868 869 870
			btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
			break;
		case Opt_check_integrity_print_mask:
871
			ret = match_int(&args[0], &intarg);
872
			if (ret)
873
				goto out;
874 875 876
			info->check_integrity_print_mask = intarg;
			btrfs_info(info, "check_integrity_print_mask 0x%x",
				   info->check_integrity_print_mask);
877 878 879 880 881
			break;
#else
		case Opt_check_integrity_including_extent_data:
		case Opt_check_integrity:
		case Opt_check_integrity_print_mask:
882 883
			btrfs_err(info,
				  "support for check_integrity* not compiled in!");
884 885 886
			ret = -EINVAL;
			goto out;
#endif
J
Jeff Mahoney 已提交
887 888 889 890 891 892 893 894 895 896 897 898
		case Opt_fatal_errors:
			if (strcmp(args[0].from, "panic") == 0)
				btrfs_set_opt(info->mount_opt,
					      PANIC_ON_FATAL_ERROR);
			else if (strcmp(args[0].from, "bug") == 0)
				btrfs_clear_opt(info->mount_opt,
					      PANIC_ON_FATAL_ERROR);
			else {
				ret = -EINVAL;
				goto out;
			}
			break;
899 900 901
		case Opt_commit_interval:
			intarg = 0;
			ret = match_int(&args[0], &intarg);
902
			if (ret)
903
				goto out;
904
			if (intarg == 0) {
905
				btrfs_info(info,
906
					   "using default commit interval %us",
J
Jeff Mahoney 已提交
907
					   BTRFS_DEFAULT_COMMIT_INTERVAL);
908 909 910 911
				intarg = BTRFS_DEFAULT_COMMIT_INTERVAL;
			} else if (intarg > 300) {
				btrfs_warn(info, "excessive commit interval %d",
					   intarg);
912
			}
913
			info->commit_interval = intarg;
914
			break;
915 916 917 918 919
		case Opt_rescue:
			ret = parse_rescue_options(info, args[0].from);
			if (ret < 0)
				goto out;
			break;
920 921
#ifdef CONFIG_BTRFS_DEBUG
		case Opt_fragment_all:
922
			btrfs_info(info, "fragmenting all space");
923 924 925 926
			btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
			btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
			break;
		case Opt_fragment_metadata:
927
			btrfs_info(info, "fragmenting metadata");
928 929 930 931
			btrfs_set_opt(info->mount_opt,
				      FRAGMENT_METADATA);
			break;
		case Opt_fragment_data:
932
			btrfs_info(info, "fragmenting data");
933 934
			btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
			break;
J
Josef Bacik 已提交
935 936 937 938 939 940
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
		case Opt_ref_verify:
			btrfs_info(info, "doing ref verification");
			btrfs_set_opt(info->mount_opt, REF_VERIFY);
			break;
941
#endif
S
Sage Weil 已提交
942
		case Opt_err:
943
			btrfs_err(info, "unrecognized mount option '%s'", p);
S
Sage Weil 已提交
944 945
			ret = -EINVAL;
			goto out;
946
		default:
947
			break;
948 949
		}
	}
950 951 952 953
check:
	/*
	 * Extra check for current option against current flag
	 */
954
	if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & SB_RDONLY)) {
955
		btrfs_err(info,
956 957 958
			  "nologreplay must be used with ro mount option");
		ret = -EINVAL;
	}
S
Sage Weil 已提交
959
out:
960
	if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
961 962
	    !btrfs_test_opt(info, FREE_SPACE_TREE) &&
	    !btrfs_test_opt(info, CLEAR_CACHE)) {
963
		btrfs_err(info, "cannot disable free space tree");
964 965 966
		ret = -EINVAL;

	}
967
	if (!ret && btrfs_test_opt(info, SPACE_CACHE))
968
		btrfs_info(info, "disk space caching is enabled");
969
	if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
970
		btrfs_info(info, "using free space tree");
S
Sage Weil 已提交
971
	return ret;
972 973 974 975 976 977 978 979
}

/*
 * Parse mount options that are required early in the mount process.
 *
 * All other options will be parsed on much later in the mount process and
 * only when we need to allocate a new super block.
 */
980 981
static int btrfs_parse_device_options(const char *options, fmode_t flags,
				      void *holder)
982 983
{
	substring_t args[MAX_OPT_ARGS];
984
	char *device_name, *opts, *orig, *p;
985
	struct btrfs_device *device = NULL;
986 987
	int error = 0;

988 989
	lockdep_assert_held(&uuid_mutex);

990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014
	if (!options)
		return 0;

	/*
	 * strsep changes the string, duplicate it because btrfs_parse_options
	 * gets called later
	 */
	opts = kstrdup(options, GFP_KERNEL);
	if (!opts)
		return -ENOMEM;
	orig = opts;

	while ((p = strsep(&opts, ",")) != NULL) {
		int token;

		if (!*p)
			continue;

		token = match_token(p, tokens, args);
		if (token == Opt_device) {
			device_name = match_strdup(&args[0]);
			if (!device_name) {
				error = -ENOMEM;
				goto out;
			}
1015 1016
			device = btrfs_scan_one_device(device_name, flags,
					holder);
1017
			kfree(device_name);
1018 1019
			if (IS_ERR(device)) {
				error = PTR_ERR(device);
1020
				goto out;
1021
			}
1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034
		}
	}

out:
	kfree(orig);
	return error;
}

/*
 * Parse mount options that are related to subvolume id
 *
 * The value is later passed to mount_subvol()
 */
1035 1036
static int btrfs_parse_subvol_options(const char *options, char **subvol_name,
		u64 *subvol_objectid)
1037 1038 1039
{
	substring_t args[MAX_OPT_ARGS];
	char *opts, *orig, *p;
1040
	int error = 0;
1041
	u64 subvolid;
1042 1043

	if (!options)
1044
		return 0;
1045 1046

	/*
1047
	 * strsep changes the string, duplicate it because
1048
	 * btrfs_parse_device_options gets called later
1049 1050 1051 1052
	 */
	opts = kstrdup(options, GFP_KERNEL);
	if (!opts)
		return -ENOMEM;
1053
	orig = opts;
1054 1055 1056 1057 1058 1059 1060 1061 1062

	while ((p = strsep(&opts, ",")) != NULL) {
		int token;
		if (!*p)
			continue;

		token = match_token(p, tokens, args);
		switch (token) {
		case Opt_subvol:
1063
			kfree(*subvol_name);
1064
			*subvol_name = match_strdup(&args[0]);
1065 1066 1067 1068
			if (!*subvol_name) {
				error = -ENOMEM;
				goto out;
			}
1069
			break;
1070
		case Opt_subvolid:
1071 1072
			error = match_u64(&args[0], &subvolid);
			if (error)
1073
				goto out;
1074 1075 1076 1077 1078 1079

			/* we want the original fs_tree */
			if (subvolid == 0)
				subvolid = BTRFS_FS_TREE_OBJECTID;

			*subvol_objectid = subvolid;
1080
			break;
1081 1082 1083 1084 1085
		default:
			break;
		}
	}

1086
out:
1087
	kfree(orig);
1088
	return error;
1089 1090
}

1091 1092
/*
 * Build the path of the subvolume @subvol_objectid, relative to the top-level
 * subvolume.
 *
 * The path is assembled from right to left inside a PATH_MAX sized buffer:
 * every root backref contributes the subvolume's own name, and the inode refs
 * of the containing subvolume contribute the directories leading to it.
 *
 * Returns a kmalloc()ed, NUL-terminated string (just "/" when
 * @subvol_objectid already is BTRFS_FS_TREE_OBJECTID) that the caller has to
 * kfree(), or an ERR_PTR(): -ENOMEM, -ENOENT when an expected ref item is
 * missing, -ENAMETOOLONG when the path does not fit, or an error from the
 * tree search / root lookup.
 */
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
					  u64 subvol_objectid)
{
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *fs_root = NULL;
	struct btrfs_root_ref *root_ref;
	struct btrfs_inode_ref *inode_ref;
	struct btrfs_key key;
	struct btrfs_path *path;
	char *buf = NULL;
	char *pos;
	u64 dirid;
	int len;
	int ret;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto err;
	}
	path->leave_spinning = 1;

	buf = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!buf) {
		ret = -ENOMEM;
		goto err;
	}
	/* Start at the terminator; components are prepended in front of it. */
	pos = buf + PATH_MAX - 1;
	pos[0] = '\0';

	/*
	 * Walk up the subvolume trees in the tree of tree roots by root
	 * backrefs until we hit the top-level subvolume.
	 */
	while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) {
		key.objectid = subvol_objectid;
		key.type = BTRFS_ROOT_BACKREF_KEY;
		key.offset = (u64)-1;

		/*
		 * No exact match for offset -1: step back to the previous
		 * item of the wanted objectid/type, if there is one.
		 */
		ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
		if (ret > 0) {
			ret = btrfs_previous_item(tree_root, path,
						  subvol_objectid,
						  BTRFS_ROOT_BACKREF_KEY);
			if (ret > 0)
				ret = -ENOENT;
		}
		if (ret < 0)
			goto err;

		/* The offset of the found key is the next subvolume to visit. */
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		subvol_objectid = key.offset;

		root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
					  struct btrfs_root_ref);
		len = btrfs_root_ref_name_len(path->nodes[0], root_ref);
		pos -= len + 1;
		if (pos < buf) {
			ret = -ENAMETOOLONG;
			goto err;
		}
		/* The name is read from directly behind the ref structure. */
		read_extent_buffer(path->nodes[0], pos + 1,
				   (unsigned long)(root_ref + 1), len);
		pos[0] = '/';
		dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref);
		btrfs_release_path(path);

		fs_root = btrfs_get_fs_root(fs_info, subvol_objectid, true);
		if (IS_ERR(fs_root)) {
			ret = PTR_ERR(fs_root);
			/* Keep the error path from putting an ERR_PTR. */
			fs_root = NULL;
			goto err;
		}

		/*
		 * Walk up the filesystem tree by inode refs until we hit the
		 * root directory.
		 */
		while (dirid != BTRFS_FIRST_FREE_OBJECTID) {
			key.objectid = dirid;
			key.type = BTRFS_INODE_REF_KEY;
			key.offset = (u64)-1;

			ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
			if (ret > 0) {
				ret = btrfs_previous_item(fs_root, path, dirid,
							  BTRFS_INODE_REF_KEY);
				if (ret > 0)
					ret = -ENOENT;
			}
			if (ret < 0)
				goto err;

			btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
			dirid = key.offset;

			inode_ref = btrfs_item_ptr(path->nodes[0],
						   path->slots[0],
						   struct btrfs_inode_ref);
			len = btrfs_inode_ref_name_len(path->nodes[0],
						       inode_ref);
			pos -= len + 1;
			if (pos < buf) {
				ret = -ENAMETOOLONG;
				goto err;
			}
			read_extent_buffer(path->nodes[0], pos + 1,
					   (unsigned long)(inode_ref + 1), len);
			pos[0] = '/';
			btrfs_release_path(path);
		}
		btrfs_put_root(fs_root);
		fs_root = NULL;
	}

	btrfs_free_path(path);
	if (pos == buf + PATH_MAX - 1) {
		/* Nothing was prepended: this is the top-level subvolume. */
		buf[0] = '/';
		buf[1] = '\0';
	} else {
		/* Shift the assembled path, terminator included, to the front. */
		memmove(buf, pos, buf + PATH_MAX - pos);
	}
	return buf;

err:
	btrfs_put_root(fs_root);
	btrfs_free_path(path);
	kfree(buf);
	return ERR_PTR(ret);
}

/*
 * Look up the objectid of the subvolume that is mounted by default.
 *
 * Stores the result in *@objectid and returns 0, or returns -ENOMEM / the
 * error from btrfs_lookup_dir_item() and leaves *@objectid untouched.
 */
static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid)
{
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_dir_item *item;
	struct btrfs_path *path;
	struct btrfs_key location;
	u64 dir_id;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->leave_spinning = 1;

	/*
	 * Find the "default" dir item which points to the root item that we
	 * will mount by default if we haven't been given a specific subvolume
	 * to mount.
	 */
	dir_id = btrfs_super_root_dir(fs_info->super_copy);
	item = btrfs_lookup_dir_item(NULL, tree_root, path, dir_id, "default", 7, 0);
	if (IS_ERR(item)) {
		ret = PTR_ERR(item);
	} else if (!item) {
		/*
		 * The default dir item isn't there.  This is weird since it's
		 * always been there, but don't freak out, just fall back to
		 * the top-level subvolume.
		 */
		*objectid = BTRFS_FS_TREE_OBJECTID;
	} else {
		btrfs_dir_item_key_to_cpu(path->nodes[0], item, &location);
		*objectid = location.objectid;
	}

	btrfs_free_path(path);
	return ret;
}

C
Chris Mason 已提交
1269
static int btrfs_fill_super(struct super_block *sb,
1270
			    struct btrfs_fs_devices *fs_devices,
1271
			    void *data)
C
Chris Mason 已提交
1272
{
C
Chris Mason 已提交
1273
	struct inode *inode;
1274
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
C
Chris Mason 已提交
1275
	int err;
1276

C
Chris Mason 已提交
1277 1278 1279
	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_magic = BTRFS_SUPER_MAGIC;
	sb->s_op = &btrfs_super_ops;
A
Al Viro 已提交
1280
	sb->s_d_op = &btrfs_dentry_operations;
B
Balaji Rao 已提交
1281
	sb->s_export_op = &btrfs_export_ops;
J
Josef Bacik 已提交
1282
	sb->s_xattr = btrfs_xattr_handlers;
C
Chris Mason 已提交
1283
	sb->s_time_gran = 1;
C
Chris Mason 已提交
1284
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
1285
	sb->s_flags |= SB_POSIXACL;
1286
#endif
M
Matthew Garrett 已提交
1287
	sb->s_flags |= SB_I_VERSION;
1288
	sb->s_iflags |= SB_I_CGROUPWB;
1289 1290 1291 1292 1293 1294 1295

	err = super_setup_bdi(sb);
	if (err) {
		btrfs_err(fs_info, "super_setup_bdi failed");
		return err;
	}

A
Al Viro 已提交
1296 1297
	err = open_ctree(sb, fs_devices, (char *)data);
	if (err) {
1298
		btrfs_err(fs_info, "open_ctree failed");
A
Al Viro 已提交
1299
		return err;
1300 1301
	}

D
David Sterba 已提交
1302
	inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
1303 1304
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
C
Chris Mason 已提交
1305
		goto fail_close;
C
Chris Mason 已提交
1306 1307
	}

1308 1309
	sb->s_root = d_make_root(inode);
	if (!sb->s_root) {
C
Chris Mason 已提交
1310 1311
		err = -ENOMEM;
		goto fail_close;
C
Chris Mason 已提交
1312
	}
1313

D
Dan Magenheimer 已提交
1314
	cleancache_init_fs(sb);
1315
	sb->s_flags |= SB_ACTIVE;
C
Chris Mason 已提交
1316
	return 0;
C
Chris Mason 已提交
1317 1318

fail_close:
1319
	close_ctree(fs_info);
C
Chris Mason 已提交
1320
	return err;
C
Chris Mason 已提交
1321 1322
}

S
Sage Weil 已提交
1323
int btrfs_sync_fs(struct super_block *sb, int wait)
C
Chris Mason 已提交
1324 1325
{
	struct btrfs_trans_handle *trans;
1326 1327
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
	struct btrfs_root *root = fs_info->tree_root;
C
Chris Mason 已提交
1328

1329
	trace_btrfs_sync_fs(fs_info, wait);
1330

C
Chris Mason 已提交
1331
	if (!wait) {
1332
		filemap_flush(fs_info->btree_inode->i_mapping);
C
Chris Mason 已提交
1333 1334
		return 0;
	}
1335

1336
	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
1337

M
Miao Xie 已提交
1338
	trans = btrfs_attach_transaction_barrier(root);
1339
	if (IS_ERR(trans)) {
1340
		/* no transaction, don't bother */
1341 1342 1343 1344 1345 1346 1347
		if (PTR_ERR(trans) == -ENOENT) {
			/*
			 * Exit unless we have some pending changes
			 * that need to go through commit
			 */
			if (fs_info->pending_changes == 0)
				return 0;
1348 1349 1350 1351 1352 1353
			/*
			 * A non-blocking test if the fs is frozen. We must not
			 * start a new transaction here otherwise a deadlock
			 * happens. The pending operations are delayed to the
			 * next commit after thawing.
			 */
1354 1355
			if (sb_start_write_trylock(sb))
				sb_end_write(sb);
1356 1357
			else
				return 0;
1358 1359
			trans = btrfs_start_transaction(root, 0);
		}
1360 1361
		if (IS_ERR(trans))
			return PTR_ERR(trans);
1362
	}
1363
	return btrfs_commit_transaction(trans);
C
Chris Mason 已提交
1364 1365
}

1366
static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
E
Eric Paris 已提交
1367
{
1368
	struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
1369
	const char *compress_type;
E
Eric Paris 已提交
1370

1371
	if (btrfs_test_opt(info, DEGRADED))
E
Eric Paris 已提交
1372
		seq_puts(seq, ",degraded");
1373
	if (btrfs_test_opt(info, NODATASUM))
E
Eric Paris 已提交
1374
		seq_puts(seq, ",nodatasum");
1375
	if (btrfs_test_opt(info, NODATACOW))
E
Eric Paris 已提交
1376
		seq_puts(seq, ",nodatacow");
1377
	if (btrfs_test_opt(info, NOBARRIER))
E
Eric Paris 已提交
1378
		seq_puts(seq, ",nobarrier");
1379
	if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
1380
		seq_printf(seq, ",max_inline=%llu", info->max_inline);
E
Eric Paris 已提交
1381 1382
	if (info->thread_pool_size !=  min_t(unsigned long,
					     num_online_cpus() + 2, 8))
1383
		seq_printf(seq, ",thread_pool=%u", info->thread_pool_size);
1384
	if (btrfs_test_opt(info, COMPRESS)) {
1385
		compress_type = btrfs_compress_type2str(info->compress_type);
1386
		if (btrfs_test_opt(info, FORCE_COMPRESS))
T
Tsutomu Itoh 已提交
1387 1388 1389
			seq_printf(seq, ",compress-force=%s", compress_type);
		else
			seq_printf(seq, ",compress=%s", compress_type);
1390
		if (info->compress_level)
1391
			seq_printf(seq, ":%d", info->compress_level);
T
Tsutomu Itoh 已提交
1392
	}
1393
	if (btrfs_test_opt(info, NOSSD))
C
Chris Mason 已提交
1394
		seq_puts(seq, ",nossd");
1395
	if (btrfs_test_opt(info, SSD_SPREAD))
1396
		seq_puts(seq, ",ssd_spread");
1397
	else if (btrfs_test_opt(info, SSD))
E
Eric Paris 已提交
1398
		seq_puts(seq, ",ssd");
1399
	if (btrfs_test_opt(info, NOTREELOG))
1400
		seq_puts(seq, ",notreelog");
1401
	if (btrfs_test_opt(info, NOLOGREPLAY))
1402
		seq_puts(seq, ",rescue=nologreplay");
1403
	if (btrfs_test_opt(info, FLUSHONCOMMIT))
1404
		seq_puts(seq, ",flushoncommit");
1405
	if (btrfs_test_opt(info, DISCARD_SYNC))
1406
		seq_puts(seq, ",discard");
1407 1408
	if (btrfs_test_opt(info, DISCARD_ASYNC))
		seq_puts(seq, ",discard=async");
1409
	if (!(info->sb->s_flags & SB_POSIXACL))
E
Eric Paris 已提交
1410
		seq_puts(seq, ",noacl");
1411
	if (btrfs_test_opt(info, SPACE_CACHE))
T
Tsutomu Itoh 已提交
1412
		seq_puts(seq, ",space_cache");
1413
	else if (btrfs_test_opt(info, FREE_SPACE_TREE))
1414
		seq_puts(seq, ",space_cache=v2");
1415
	else
1416
		seq_puts(seq, ",nospace_cache");
1417
	if (btrfs_test_opt(info, RESCAN_UUID_TREE))
1418
		seq_puts(seq, ",rescan_uuid_tree");
1419
	if (btrfs_test_opt(info, CLEAR_CACHE))
T
Tsutomu Itoh 已提交
1420
		seq_puts(seq, ",clear_cache");
1421
	if (btrfs_test_opt(info, USER_SUBVOL_RM_ALLOWED))
T
Tsutomu Itoh 已提交
1422
		seq_puts(seq, ",user_subvol_rm_allowed");
1423
	if (btrfs_test_opt(info, ENOSPC_DEBUG))
1424
		seq_puts(seq, ",enospc_debug");
1425
	if (btrfs_test_opt(info, AUTO_DEFRAG))
1426
		seq_puts(seq, ",autodefrag");
1427
	if (btrfs_test_opt(info, INODE_MAP_CACHE))
1428
		seq_puts(seq, ",inode_cache");
1429
	if (btrfs_test_opt(info, SKIP_BALANCE))
1430
		seq_puts(seq, ",skip_balance");
1431
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
1432
	if (btrfs_test_opt(info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
1433
		seq_puts(seq, ",check_int_data");
1434
	else if (btrfs_test_opt(info, CHECK_INTEGRITY))
1435 1436 1437 1438 1439 1440
		seq_puts(seq, ",check_int");
	if (info->check_integrity_print_mask)
		seq_printf(seq, ",check_int_print_mask=%d",
				info->check_integrity_print_mask);
#endif
	if (info->metadata_ratio)
1441
		seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio);
1442
	if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR))
J
Jeff Mahoney 已提交
1443
		seq_puts(seq, ",fatal_errors=panic");
1444
	if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
1445
		seq_printf(seq, ",commit=%u", info->commit_interval);
1446
#ifdef CONFIG_BTRFS_DEBUG
1447
	if (btrfs_test_opt(info, FRAGMENT_DATA))
1448
		seq_puts(seq, ",fragment=data");
1449
	if (btrfs_test_opt(info, FRAGMENT_METADATA))
1450 1451
		seq_puts(seq, ",fragment=metadata");
#endif
J
Josef Bacik 已提交
1452 1453
	if (btrfs_test_opt(info, REF_VERIFY))
		seq_puts(seq, ",ref_verify");
1454 1455 1456 1457
	seq_printf(seq, ",subvolid=%llu",
		  BTRFS_I(d_inode(dentry))->root->root_key.objectid);
	seq_puts(seq, ",subvol=");
	seq_dentry(seq, dentry, " \t\n\\");
E
Eric Paris 已提交
1458 1459 1460
	return 0;
}

1461
static int btrfs_test_super(struct super_block *s, void *data)
Y
Yan 已提交
1462
{
1463 1464
	struct btrfs_fs_info *p = data;
	struct btrfs_fs_info *fs_info = btrfs_sb(s);
Y
Yan 已提交
1465

1466
	return fs_info->fs_devices == p->fs_devices;
Y
Yan 已提交
1467 1468
}

1469 1470
/*
 * Set callback passed to sget(): runs set_anon_super() on the new super block
 * and, when that succeeds, stores @data in s->s_fs_info.
 *
 * Returns the result of set_anon_super().
 */
static int btrfs_set_super(struct super_block *s, void *data)
{
	int ret;

	ret = set_anon_super(s, data);
	if (ret)
		return ret;

	s->s_fs_info = data;
	return 0;
}

1477 1478 1479 1480 1481 1482 1483 1484 1485 1486
/*
 * Subvolumes are identified by inode number BTRFS_FIRST_FREE_OBJECTID (256).
 *
 * Returns 1 when @inode is non-NULL and has that inode number, 0 otherwise.
 */
static inline int is_subvolume_inode(struct inode *inode)
{
	return inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID;
}

1487
/*
 * Turn the mounted device root @mnt into a mount of the requested subvolume.
 *
 * When @subvol_name is NULL the name is derived from @subvol_objectid, or from
 * the default subvolume if that is 0 as well.  After mount_subtree() the
 * result is verified: it must be a subvolume root and, when an id was
 * requested (or looked up), it must belong to that subvolume.
 *
 * This function consumes both @mnt and @subvol_name: the vfsmount reference
 * is dropped and the name is freed on every path.
 *
 * Returns the root dentry of the subvolume or an ERR_PTR().
 */
static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
				   struct vfsmount *mnt)
{
	struct btrfs_fs_info *fs_info;
	struct super_block *sb;
	struct inode *root_inode;
	struct dentry *root;
	u64 root_objectid;
	int ret;

	if (!subvol_name) {
		if (!subvol_objectid) {
			ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb),
							  &subvol_objectid);
			if (ret) {
				root = ERR_PTR(ret);
				goto out;
			}
		}
		subvol_name = btrfs_get_subvol_name_from_objectid(
					btrfs_sb(mnt->mnt_sb), subvol_objectid);
		if (IS_ERR(subvol_name)) {
			root = ERR_CAST(subvol_name);
			/* Nothing to kfree() at the exit label. */
			subvol_name = NULL;
			goto out;
		}
	}

	root = mount_subtree(mnt, subvol_name);
	/* mount_subtree() drops our reference on the vfsmount. */
	mnt = NULL;
	if (IS_ERR(root))
		goto out;

	sb = root->d_sb;
	fs_info = btrfs_sb(sb);
	root_inode = d_inode(root);
	root_objectid = BTRFS_I(root_inode)->root->root_key.objectid;

	ret = 0;
	if (!is_subvolume_inode(root_inode)) {
		btrfs_err(fs_info, "'%s' is not a valid subvolume",
		       subvol_name);
		ret = -EINVAL;
	}
	if (subvol_objectid && root_objectid != subvol_objectid) {
		/*
		 * This will also catch a race condition where a subvolume
		 * which was passed by ID is renamed and another subvolume is
		 * renamed over the old location.
		 */
		btrfs_err(fs_info,
			  "subvol '%s' does not match subvolid %llu",
			  subvol_name, subvol_objectid);
		ret = -EINVAL;
	}
	if (ret) {
		dput(root);
		root = ERR_PTR(ret);
		deactivate_locked_super(sb);
	}

out:
	mntput(mnt);
	kfree(subvol_name);
	return root;
}
1551

1552 1553 1554 1555 1556 1557
/*
 * Find a superblock for the given device / mount point.
 *
 * Note: This is based on mount_bdev from fs/super.c with a few additions
 *       for multiple device setup.  Make sure to keep it in sync.
 */
1558 1559 1560 1561 1562
static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
		int flags, const char *device_name, void *data)
{
	struct block_device *bdev = NULL;
	struct super_block *s;
1563
	struct btrfs_device *device = NULL;
1564 1565
	struct btrfs_fs_devices *fs_devices = NULL;
	struct btrfs_fs_info *fs_info = NULL;
1566
	void *new_sec_opts = NULL;
1567 1568 1569 1570 1571 1572 1573
	fmode_t mode = FMODE_READ;
	int error = 0;

	if (!(flags & SB_RDONLY))
		mode |= FMODE_WRITE;

	if (data) {
A
Al Viro 已提交
1574
		error = security_sb_eat_lsm_opts(data, &new_sec_opts);
1575 1576 1577 1578 1579 1580 1581
		if (error)
			return ERR_PTR(error);
	}

	/*
	 * Setup a dummy root and fs_info for test/set super.  This is because
	 * we don't actually fill this stuff out until open_ctree, but we need
1582 1583 1584 1585
	 * then open_ctree will properly initialize the file system specific
	 * settings later.  btrfs_init_fs_info initializes the static elements
	 * of the fs_info (locks and such) to make cleanup easier if we find a
	 * superblock with our given fs_devices later on at sget() time.
1586
	 */
1587
	fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
1588 1589 1590 1591
	if (!fs_info) {
		error = -ENOMEM;
		goto error_sec_opts;
	}
1592
	btrfs_init_fs_info(fs_info);
1593 1594 1595 1596 1597 1598 1599 1600

	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
	if (!fs_info->super_copy || !fs_info->super_for_commit) {
		error = -ENOMEM;
		goto error_fs_info;
	}

1601
	mutex_lock(&uuid_mutex);
1602
	error = btrfs_parse_device_options(data, mode, fs_type);
1603 1604
	if (error) {
		mutex_unlock(&uuid_mutex);
1605
		goto error_fs_info;
1606
	}
1607

1608 1609
	device = btrfs_scan_one_device(device_name, mode, fs_type);
	if (IS_ERR(device)) {
1610
		mutex_unlock(&uuid_mutex);
1611
		error = PTR_ERR(device);
1612
		goto error_fs_info;
1613
	}
1614

1615
	fs_devices = device->fs_devices;
1616 1617
	fs_info->fs_devices = fs_devices;

1618
	error = btrfs_open_devices(fs_devices, mode, fs_type);
1619
	mutex_unlock(&uuid_mutex);
1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637
	if (error)
		goto error_fs_info;

	if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
		error = -EACCES;
		goto error_close_devices;
	}

	bdev = fs_devices->latest_bdev;
	s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
		 fs_info);
	if (IS_ERR(s)) {
		error = PTR_ERR(s);
		goto error_close_devices;
	}

	if (s->s_root) {
		btrfs_close_devices(fs_devices);
1638
		btrfs_free_fs_info(fs_info);
1639 1640 1641 1642 1643
		if ((flags ^ s->s_flags) & SB_RDONLY)
			error = -EBUSY;
	} else {
		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
		btrfs_sb(s)->bdev_holder = fs_type;
1644 1645
		if (!strstr(crc32c_impl(), "generic"))
			set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
1646 1647
		error = btrfs_fill_super(s, fs_devices, data);
	}
A
Al Viro 已提交
1648
	if (!error)
1649
		error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL);
A
Al Viro 已提交
1650
	security_free_mnt_opts(&new_sec_opts);
1651 1652
	if (error) {
		deactivate_locked_super(s);
A
Al Viro 已提交
1653
		return ERR_PTR(error);
1654 1655 1656 1657 1658 1659 1660
	}

	return dget(s->s_root);

error_close_devices:
	btrfs_close_devices(fs_devices);
error_fs_info:
1661
	btrfs_free_fs_info(fs_info);
1662 1663 1664 1665
error_sec_opts:
	security_free_mnt_opts(&new_sec_opts);
	return ERR_PTR(error);
}
1666

1667
/*
1668
 * Mount function which is called by VFS layer.
1669
 *
1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687
 * In order to allow mounting a subvolume directly, btrfs uses mount_subtree()
 * which needs vfsmount* of device's root (/).  This means device's root has to
 * be mounted internally in any case.
 *
 * Operation flow:
 *   1. Parse subvol id related options for later use in mount_subvol().
 *
 *   2. Mount device's root (/) by calling vfs_kern_mount().
 *
 *      NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the
 *      first place. In order to avoid calling btrfs_mount() again, we use
 *      different file_system_type which is not registered to VFS by
 *      register_filesystem() (btrfs_root_fs_type). As a result,
 *      btrfs_mount_root() is called. The return value will be used by
 *      mount_subtree() in mount_subvol().
 *
 *   3. Call mount_subvol() to get the dentry of subvolume. Since there is
 *      "btrfs subvolume set-default", mount_subvol() is called always.
1688
 */
A
Al Viro 已提交
1689
static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
1690
		const char *device_name, void *data)
Y
Yan 已提交
1691
{
1692 1693
	struct vfsmount *mnt_root;
	struct dentry *root;
1694 1695
	char *subvol_name = NULL;
	u64 subvol_objectid = 0;
Y
Yan 已提交
1696 1697
	int error = 0;

1698 1699
	error = btrfs_parse_subvol_options(data, &subvol_name,
					&subvol_objectid);
1700 1701
	if (error) {
		kfree(subvol_name);
A
Al Viro 已提交
1702
		return ERR_PTR(error);
1703
	}
1704

1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715
	/* mount device's root (/) */
	mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data);
	if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) {
		if (flags & SB_RDONLY) {
			mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
				flags & ~SB_RDONLY, device_name, data);
		} else {
			mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
				flags | SB_RDONLY, device_name, data);
			if (IS_ERR(mnt_root)) {
				root = ERR_CAST(mnt_root);
1716
				kfree(subvol_name);
1717 1718
				goto out;
			}
Y
Yan 已提交
1719

1720 1721 1722 1723 1724 1725
			down_write(&mnt_root->mnt_sb->s_umount);
			error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL);
			up_write(&mnt_root->mnt_sb->s_umount);
			if (error < 0) {
				root = ERR_PTR(error);
				mntput(mnt_root);
1726
				kfree(subvol_name);
1727 1728 1729
				goto out;
			}
		}
1730
	}
1731 1732
	if (IS_ERR(mnt_root)) {
		root = ERR_CAST(mnt_root);
1733
		kfree(subvol_name);
1734
		goto out;
1735
	}
Y
Yan 已提交
1736

1737
	/* mount_subvol() will free subvol_name and mnt_root */
1738
	root = mount_subvol(subvol_name, subvol_objectid, mnt_root);
Y
Yan 已提交
1739

1740 1741
out:
	return root;
Y
Yan 已提交
1742
}
1743

1744
/*
 * Change the maximum size of the filesystem's worker pools.
 *
 * Does nothing when @new_pool_size equals @old_pool_size.  Otherwise the new
 * size is recorded in fs_info->thread_pool_size, the change is logged and the
 * new maximum is applied to each workqueue listed below.
 */
static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
				     u32 new_pool_size, u32 old_pool_size)
{
	if (new_pool_size == old_pool_size)
		return;

	fs_info->thread_pool_size = new_pool_size;

	/*
	 * Both sizes are u32, so print them as unsigned, matching the %u used
	 * for thread_pool= in btrfs_show_options().
	 */
	btrfs_info(fs_info, "resize thread pool %u -> %u",
	       old_pool_size, new_pool_size);

	btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
	btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
	btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
	btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
	btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size);
	btrfs_workqueue_set_max(fs_info->endio_meta_write_workers,
				new_pool_size);
	btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
	btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
	btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
	btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
	btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
				new_pool_size);
}

1770
static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
M
Miao Xie 已提交
1771 1772
{
	set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
1773
}
M
Miao Xie 已提交
1774

1775 1776 1777
static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
				       unsigned long old_opts, int flags)
{
M
Miao Xie 已提交
1778 1779
	if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
	    (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
1780
	     (flags & SB_RDONLY))) {
M
Miao Xie 已提交
1781 1782 1783
		/* wait for any defraggers to finish */
		wait_event(fs_info->transaction_wait,
			   (atomic_read(&fs_info->defrag_running) == 0));
1784
		if (flags & SB_RDONLY)
M
Miao Xie 已提交
1785 1786 1787 1788 1789 1790 1791 1792
			sync_filesystem(fs_info->sb);
	}
}

static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
					 unsigned long old_opts)
{
	/*
1793 1794
	 * We need to cleanup all defragable inodes if the autodefragment is
	 * close or the filesystem is read only.
M
Miao Xie 已提交
1795 1796
	 */
	if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
1797
	    (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || sb_rdonly(fs_info->sb))) {
M
Miao Xie 已提交
1798 1799 1800
		btrfs_cleanup_defrag_inodes(fs_info);
	}

1801 1802 1803 1804 1805 1806 1807 1808
	/* If we toggled discard async */
	if (!btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
	    btrfs_test_opt(fs_info, DISCARD_ASYNC))
		btrfs_discard_resume(fs_info);
	else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) &&
		 !btrfs_test_opt(fs_info, DISCARD_ASYNC))
		btrfs_discard_cleanup(fs_info);

M
Miao Xie 已提交
1809 1810 1811
	clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
}

Y
Yan Zheng 已提交
1812 1813
static int btrfs_remount(struct super_block *sb, int *flags, char *data)
{
1814 1815
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
	struct btrfs_root *root = fs_info->tree_root;
1816 1817 1818 1819
	unsigned old_flags = sb->s_flags;
	unsigned long old_opts = fs_info->mount_opt;
	unsigned long old_compress_type = fs_info->compress_type;
	u64 old_max_inline = fs_info->max_inline;
1820
	u32 old_thread_pool_size = fs_info->thread_pool_size;
1821
	u32 old_metadata_ratio = fs_info->metadata_ratio;
Y
Yan Zheng 已提交
1822 1823
	int ret;

1824
	sync_filesystem(sb);
1825
	btrfs_remount_prepare(fs_info);
M
Miao Xie 已提交
1826

1827
	if (data) {
1828
		void *new_sec_opts = NULL;
1829

A
Al Viro 已提交
1830 1831
		ret = security_sb_eat_lsm_opts(data, &new_sec_opts);
		if (!ret)
1832
			ret = security_sb_remount(sb, new_sec_opts);
A
Al Viro 已提交
1833
		security_free_mnt_opts(&new_sec_opts);
1834 1835 1836 1837
		if (ret)
			goto restore;
	}

1838
	ret = btrfs_parse_options(fs_info, data, *flags);
1839
	if (ret)
1840
		goto restore;
1841

1842
	btrfs_remount_begin(fs_info, old_opts, *flags);
1843 1844 1845
	btrfs_resize_thread_pool(fs_info,
		fs_info->thread_pool_size, old_thread_pool_size);

1846
	if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
M
Miao Xie 已提交
1847
		goto out;
Y
Yan Zheng 已提交
1848

1849
	if (*flags & SB_RDONLY) {
1850 1851 1852 1853
		/*
		 * this also happens on 'umount -rf' or on shutdown, when
		 * the filesystem is busy.
		 */
1854
		cancel_work_sync(&fs_info->async_reclaim_work);
1855

1856 1857
		btrfs_discard_cleanup(fs_info);

1858 1859 1860 1861 1862
		/* wait for the uuid_scan task to finish */
		down(&fs_info->uuid_tree_rescan_sem);
		/* avoid complains from lockdep et al. */
		up(&fs_info->uuid_tree_rescan_sem);

1863
		sb->s_flags |= SB_RDONLY;
Y
Yan Zheng 已提交
1864

1865
		/*
1866
		 * Setting SB_RDONLY will put the cleaner thread to
1867 1868 1869 1870 1871 1872 1873
		 * sleep at the next loop if it's already active.
		 * If it's already asleep, we'll leave unused block
		 * groups on disk until we're mounted read-write again
		 * unless we clean them up here.
		 */
		btrfs_delete_unused_bgs(fs_info);

1874 1875
		btrfs_dev_replace_suspend_for_unmount(fs_info);
		btrfs_scrub_cancel(fs_info);
1876
		btrfs_pause_balance(fs_info);
1877

1878
		ret = btrfs_commit_super(fs_info);
1879 1880
		if (ret)
			goto restore;
Y
Yan Zheng 已提交
1881
	} else {
1882
		if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
1883
			btrfs_err(fs_info,
1884
				"Remounting read-write after error is not allowed");
1885 1886 1887
			ret = -EINVAL;
			goto restore;
		}
1888
		if (fs_info->fs_devices->rw_devices == 0) {
1889 1890
			ret = -EACCES;
			goto restore;
1891
		}
Y
Yan Zheng 已提交
1892

1893
		if (!btrfs_check_rw_degradable(fs_info, NULL)) {
1894
			btrfs_warn(fs_info,
1895
		"too many missing devices, writable remount is not allowed");
1896 1897 1898 1899
			ret = -EACCES;
			goto restore;
		}

1900
		if (btrfs_super_log_root(fs_info->super_copy) != 0) {
1901 1902
			btrfs_warn(fs_info,
		"mount required to replay tree-log, cannot remount read-write");
1903 1904
			ret = -EINVAL;
			goto restore;
1905
		}
Y
Yan Zheng 已提交
1906

1907
		ret = btrfs_cleanup_fs_roots(fs_info);
1908 1909
		if (ret)
			goto restore;
Y
Yan Zheng 已提交
1910

1911
		/* recover relocation */
1912
		mutex_lock(&fs_info->cleaner_mutex);
1913
		ret = btrfs_recover_relocation(root);
1914
		mutex_unlock(&fs_info->cleaner_mutex);
1915 1916
		if (ret)
			goto restore;
Y
Yan Zheng 已提交
1917

1918 1919 1920 1921
		ret = btrfs_resume_balance_async(fs_info);
		if (ret)
			goto restore;

1922 1923
		ret = btrfs_resume_dev_replace_async(fs_info);
		if (ret) {
1924
			btrfs_warn(fs_info, "failed to resume dev_replace");
1925 1926
			goto restore;
		}
1927

1928 1929
		btrfs_qgroup_rescan_resume(fs_info);

1930
		if (!fs_info->uuid_root) {
1931
			btrfs_info(fs_info, "creating UUID tree");
1932 1933
			ret = btrfs_create_uuid_tree(fs_info);
			if (ret) {
J
Jeff Mahoney 已提交
1934 1935 1936
				btrfs_warn(fs_info,
					   "failed to create the UUID tree %d",
					   ret);
1937 1938 1939
				goto restore;
			}
		}
1940
		sb->s_flags &= ~SB_RDONLY;
1941

1942
		set_bit(BTRFS_FS_OPEN, &fs_info->flags);
Y
Yan Zheng 已提交
1943
	}
M
Miao Xie 已提交
1944
out:
1945
	wake_up_process(fs_info->transaction_kthread);
M
Miao Xie 已提交
1946
	btrfs_remount_cleanup(fs_info, old_opts);
Y
Yan Zheng 已提交
1947
	return 0;
1948 1949

restore:
1950
	/* We've hit an error - don't reset SB_RDONLY */
1951
	if (sb_rdonly(sb))
1952
		old_flags |= SB_RDONLY;
1953 1954 1955 1956
	sb->s_flags = old_flags;
	fs_info->mount_opt = old_opts;
	fs_info->compress_type = old_compress_type;
	fs_info->max_inline = old_max_inline;
1957 1958
	btrfs_resize_thread_pool(fs_info,
		old_thread_pool_size, fs_info->thread_pool_size);
1959
	fs_info->metadata_ratio = old_metadata_ratio;
M
Miao Xie 已提交
1960
	btrfs_remount_cleanup(fs_info, old_opts);
1961
	return ret;
Y
Yan Zheng 已提交
1962 1963
}

1964
/* Used to sort the devices by max_avail(descending sort) */
1965
static inline int btrfs_cmp_device_free_bytes(const void *dev_info1,
1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989
				       const void *dev_info2)
{
	if (((struct btrfs_device_info *)dev_info1)->max_avail >
	    ((struct btrfs_device_info *)dev_info2)->max_avail)
		return -1;
	else if (((struct btrfs_device_info *)dev_info1)->max_avail <
		 ((struct btrfs_device_info *)dev_info2)->max_avail)
		return 1;
	else
	return 0;
}

/*
 * Sort the nr_devices entries of @devices in place by their max_avail field
 * (which stores the max free extent size of each device), largest first.
 */
static inline void btrfs_descending_sort_devices(
					struct btrfs_device_info *devices,
					size_t nr_devices)
{
	const size_t entry_size = sizeof(struct btrfs_device_info);

	sort(devices, nr_devices, entry_size, btrfs_cmp_device_free_bytes,
	     NULL);
}

1990 1991 1992 1993
/*
 * The helper to calc the free space on the devices that can be used to store
 * file data.
 */
1994 1995
static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
					      u64 *free_bytes)
1996 1997 1998 1999 2000 2001 2002
{
	struct btrfs_device_info *devices_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 type;
	u64 avail_space;
	u64 min_stripe_size;
2003
	int num_stripes = 1;
2004
	int i = 0, nr_devices;
2005
	const struct btrfs_raid_attr *rattr;
2006

2007
	/*
2008
	 * We aren't under the device list lock, so this is racy-ish, but good
2009 2010
	 * enough for our purposes.
	 */
2011
	nr_devices = fs_info->fs_devices->open_devices;
2012 2013 2014 2015 2016 2017 2018 2019 2020
	if (!nr_devices) {
		smp_mb();
		nr_devices = fs_info->fs_devices->open_devices;
		ASSERT(nr_devices);
		if (!nr_devices) {
			*free_bytes = 0;
			return 0;
		}
	}
2021

2022
	devices_info = kmalloc_array(nr_devices, sizeof(*devices_info),
2023
			       GFP_KERNEL);
2024 2025 2026
	if (!devices_info)
		return -ENOMEM;

2027
	/* calc min stripe number for data space allocation */
2028
	type = btrfs_data_alloc_profile(fs_info);
2029 2030
	rattr = &btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)];

2031
	if (type & BTRFS_BLOCK_GROUP_RAID0)
2032
		num_stripes = nr_devices;
2033
	else if (type & BTRFS_BLOCK_GROUP_RAID1)
2034
		num_stripes = 2;
2035 2036
	else if (type & BTRFS_BLOCK_GROUP_RAID1C3)
		num_stripes = 3;
2037 2038
	else if (type & BTRFS_BLOCK_GROUP_RAID1C4)
		num_stripes = 4;
2039
	else if (type & BTRFS_BLOCK_GROUP_RAID10)
2040
		num_stripes = 4;
2041

2042 2043
	/* Adjust for more than 1 stripe per device */
	min_stripe_size = rattr->dev_stripes * BTRFS_STRIPE_LEN;
2044

2045 2046
	rcu_read_lock();
	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2047 2048
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
						&device->dev_state) ||
2049 2050
		    !device->bdev ||
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
2051 2052
			continue;

2053 2054 2055
		if (i >= nr_devices)
			break;

2056 2057 2058
		avail_space = device->total_bytes - device->bytes_used;

		/* align with stripe_len */
2059
		avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN);
2060 2061

		/*
2062
		 * In order to avoid overwriting the superblock on the drive,
2063 2064
		 * btrfs starts at an offset of at least 1MB when doing chunk
		 * allocation.
2065 2066 2067
		 *
		 * This ensures we have at least min_stripe_size free space
		 * after excluding 1MB.
2068
		 */
2069
		if (avail_space <= SZ_1M + min_stripe_size)
2070 2071
			continue;

2072 2073
		avail_space -= SZ_1M;

2074 2075 2076 2077 2078
		devices_info[i].dev = device;
		devices_info[i].max_avail = avail_space;

		i++;
	}
2079
	rcu_read_unlock();
2080 2081 2082 2083 2084 2085 2086

	nr_devices = i;

	btrfs_descending_sort_devices(devices_info, nr_devices);

	i = nr_devices - 1;
	avail_space = 0;
2087 2088
	while (nr_devices >= rattr->devs_min) {
		num_stripes = min(num_stripes, nr_devices);
2089

2090 2091 2092 2093
		if (devices_info[i].max_avail >= min_stripe_size) {
			int j;
			u64 alloc_size;

2094
			avail_space += devices_info[i].max_avail * num_stripes;
2095
			alloc_size = devices_info[i].max_avail;
2096
			for (j = i + 1 - num_stripes; j <= i; j++)
2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107
				devices_info[j].max_avail -= alloc_size;
		}
		i--;
		nr_devices--;
	}

	kfree(devices_info);
	*free_bytes = avail_space;
	return 0;
}

2108 2109 2110 2111 2112 2113 2114
/*
 * Calculate numbers for 'df', pessimistic in case of mixed raid profiles.
 *
 * If there's a redundant raid level at DATA block groups, use the respective
 * multiplier to scale the sizes.
 *
 * Unused device space usage is based on simulating the chunk allocator
2115 2116 2117
 * algorithm that respects the device sizes and order of allocations.  This is
 * a close approximation of the actual use but there are other factors that may
 * change the result (like a new metadata chunk).
2118
 *
2119
 * If metadata is exhausted, f_bavail will be 0.
2120
 */
C
Chris Mason 已提交
2121 2122
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
2123 2124
	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
	struct btrfs_super_block *disk_super = fs_info->super_copy;
2125 2126
	struct btrfs_space_info *found;
	u64 total_used = 0;
2127
	u64 total_free_data = 0;
2128
	u64 total_free_meta = 0;
2129
	int bits = dentry->d_sb->s_blocksize_bits;
2130
	__be32 *fsid = (__be32 *)fs_info->fs_devices->fsid;
2131 2132
	unsigned factor = 1;
	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
2133
	int ret;
2134
	u64 thresh = 0;
2135
	int mixed = 0;
C
Chris Mason 已提交
2136

2137
	rcu_read_lock();
2138
	list_for_each_entry_rcu(found, &fs_info->space_info, list) {
2139
		if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
2140 2141
			int i;

2142 2143 2144
			total_free_data += found->disk_total - found->disk_used;
			total_free_data -=
				btrfs_account_ro_block_groups_free_space(found);
2145 2146

			for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2147 2148 2149
				if (!list_empty(&found->block_groups[i]))
					factor = btrfs_bg_type_to_factor(
						btrfs_raid_array[i].bg_flag);
2150
			}
2151
		}
2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162

		/*
		 * Metadata in mixed block goup profiles are accounted in data
		 */
		if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) {
			if (found->flags & BTRFS_BLOCK_GROUP_DATA)
				mixed = 1;
			else
				total_free_meta += found->disk_total -
					found->disk_used;
		}
2163

2164
		total_used += found->disk_used;
J
Josef Bacik 已提交
2165
	}
2166

2167 2168
	rcu_read_unlock();

2169 2170 2171 2172 2173 2174
	buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor);
	buf->f_blocks >>= bits;
	buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits);

	/* Account global block reserve as used, it's in logical size already */
	spin_lock(&block_rsv->lock);
2175 2176 2177 2178 2179
	/* Mixed block groups accounting is not byte-accurate, avoid overflow */
	if (buf->f_bfree >= block_rsv->size >> bits)
		buf->f_bfree -= block_rsv->size >> bits;
	else
		buf->f_bfree = 0;
2180 2181
	spin_unlock(&block_rsv->lock);

2182
	buf->f_bavail = div_u64(total_free_data, factor);
2183
	ret = btrfs_calc_avail_data_space(fs_info, &total_free_data);
2184
	if (ret)
2185
		return ret;
2186
	buf->f_bavail += div_u64(total_free_data, factor);
2187
	buf->f_bavail = buf->f_bavail >> bits;
C
Chris Mason 已提交
2188

2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201
	/*
	 * We calculate the remaining metadata space minus global reserve. If
	 * this is (supposedly) smaller than zero, there's no space. But this
	 * does not hold in practice, the exhausted state happens where's still
	 * some positive delta. So we apply some guesswork and compare the
	 * delta to a 4M threshold.  (Practically observed delta was ~2M.)
	 *
	 * We probably cannot calculate the exact threshold value because this
	 * depends on the internal reservations requested by various
	 * operations, so some operations that consume a few metadata will
	 * succeed even if the Avail is zero. But this is better than the other
	 * way around.
	 */
2202
	thresh = SZ_4M;
2203

2204 2205 2206 2207 2208 2209 2210 2211 2212
	/*
	 * We only want to claim there's no available space if we can no longer
	 * allocate chunks for our metadata profile and our global reserve will
	 * not fit in the free metadata space.  If we aren't ->full then we
	 * still can allocate chunks and thus are fine using the currently
	 * calculated f_bavail.
	 */
	if (!mixed && block_rsv->space_info->full &&
	    total_free_meta - thresh < block_rsv->size)
2213 2214
		buf->f_bavail = 0;

2215 2216 2217 2218
	buf->f_type = BTRFS_SUPER_MAGIC;
	buf->f_bsize = dentry->d_sb->s_blocksize;
	buf->f_namelen = BTRFS_NAME_LEN;

2219
	/* We treat it as constant endianness (it doesn't matter _which_)
C
Chris Mason 已提交
2220
	   because we want the fsid to come out the same whether mounted
2221 2222 2223
	   on a big-endian or little-endian host */
	buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
	buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
2224
	/* Mask in the root object ID too, to disambiguate subvols */
2225 2226 2227 2228
	buf->f_fsid.val[0] ^=
		BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
	buf->f_fsid.val[1] ^=
		BTRFS_I(d_inode(dentry))->root->root_key.objectid;
2229

C
Chris Mason 已提交
2230 2231
	return 0;
}
C
Chris Mason 已提交
2232

A
Al Viro 已提交
2233 2234
/*
 * ->kill_sb handler: tear down the anonymous super block, then release the
 * btrfs_fs_info that was attached to it.
 */
static void btrfs_kill_super(struct super_block *sb)
{
	struct btrfs_fs_info *info;

	/* Look up fs_info before the super block is torn down. */
	info = btrfs_sb(sb);

	kill_anon_super(sb);
	btrfs_free_fs_info(info);
}

2240 2241 2242
static struct file_system_type btrfs_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "btrfs",
A
Al Viro 已提交
2243
	.mount		= btrfs_mount,
A
Al Viro 已提交
2244
	.kill_sb	= btrfs_kill_super,
2245
	.fs_flags	= FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
2246
};
2247 2248 2249 2250 2251 2252 2253 2254 2255

static struct file_system_type btrfs_root_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "btrfs",
	.mount		= btrfs_mount_root,
	.kill_sb	= btrfs_kill_super,
	.fs_flags	= FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
};

2256
MODULE_ALIAS_FS("btrfs");
2257

2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268
/*
 * open() handler: every newly opened file starts out with no private data.
 * Always returns 0.
 */
static int btrfs_control_open(struct inode *inode, struct file *file)
{
	/*
	 * private_data of the control file holds the transaction once one is
	 * started, and is also what tells whether a transaction is already in
	 * progress, so it has to begin as NULL.
	 */
	file->private_data = NULL;

	return 0;
}

C
Chris Mason 已提交
2269
/*
2270
 * Used by /dev/btrfs-control for devices ioctls.
C
Chris Mason 已提交
2271
 */
2272 2273 2274 2275
static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
				unsigned long arg)
{
	struct btrfs_ioctl_vol_args *vol;
2276
	struct btrfs_device *device = NULL;
2277
	int ret = -ENOTTY;
2278

2279 2280 2281
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

L
Li Zefan 已提交
2282 2283 2284
	vol = memdup_user((void __user *)arg, sizeof(*vol));
	if (IS_ERR(vol))
		return PTR_ERR(vol);
2285
	vol->name[BTRFS_PATH_NAME_MAX] = '\0';
2286

2287 2288
	switch (cmd) {
	case BTRFS_IOC_SCAN_DEV:
2289
		mutex_lock(&uuid_mutex);
2290 2291 2292
		device = btrfs_scan_one_device(vol->name, FMODE_READ,
					       &btrfs_root_fs_type);
		ret = PTR_ERR_OR_ZERO(device);
2293
		mutex_unlock(&uuid_mutex);
2294
		break;
2295 2296 2297
	case BTRFS_IOC_FORGET_DEV:
		ret = btrfs_forget_devices(vol->name);
		break;
J
Josef Bacik 已提交
2298
	case BTRFS_IOC_DEVICES_READY:
2299
		mutex_lock(&uuid_mutex);
2300 2301 2302
		device = btrfs_scan_one_device(vol->name, FMODE_READ,
					       &btrfs_root_fs_type);
		if (IS_ERR(device)) {
2303
			mutex_unlock(&uuid_mutex);
2304
			ret = PTR_ERR(device);
J
Josef Bacik 已提交
2305
			break;
2306
		}
2307 2308
		ret = !(device->fs_devices->num_devices ==
			device->fs_devices->total_devices);
2309
		mutex_unlock(&uuid_mutex);
J
Josef Bacik 已提交
2310
		break;
2311
	case BTRFS_IOC_GET_SUPPORTED_FEATURES:
2312
		ret = btrfs_ioctl_get_supported_features((void __user*)arg);
2313
		break;
2314
	}
L
Li Zefan 已提交
2315

2316
	kfree(vol);
L
Linda Knippers 已提交
2317
	return ret;
2318 2319
}

2320
static int btrfs_freeze(struct super_block *sb)
Y
Yan 已提交
2321
{
2322
	struct btrfs_trans_handle *trans;
2323 2324
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
	struct btrfs_root *root = fs_info->tree_root;
2325

2326
	set_bit(BTRFS_FS_FROZEN, &fs_info->flags);
2327 2328 2329 2330 2331 2332
	/*
	 * We don't need a barrier here, we'll wait for any transaction that
	 * could be in progress on other threads (and do delayed iputs that
	 * we want to avoid on a frozen filesystem), or do the commit
	 * ourselves.
	 */
M
Miao Xie 已提交
2333
	trans = btrfs_attach_transaction_barrier(root);
2334 2335 2336 2337 2338 2339
	if (IS_ERR(trans)) {
		/* no transaction, don't bother */
		if (PTR_ERR(trans) == -ENOENT)
			return 0;
		return PTR_ERR(trans);
	}
2340
	return btrfs_commit_transaction(trans);
Y
Yan 已提交
2341 2342
}

2343 2344
static int btrfs_unfreeze(struct super_block *sb)
{
2345 2346 2347
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);

	clear_bit(BTRFS_FS_FROZEN, &fs_info->flags);
2348 2349 2350
	return 0;
}

J
Josef Bacik 已提交
2351 2352 2353 2354 2355 2356 2357
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
	struct btrfs_fs_devices *cur_devices;
	struct btrfs_device *dev, *first_dev = NULL;
	struct list_head *head;

2358 2359 2360 2361 2362
	/*
	 * Lightweight locking of the devices. We should not need
	 * device_list_mutex here as we only read the device data and the list
	 * is protected by RCU.  Even if a device is deleted during the list
	 * traversals, we'll get valid data, the freeing callback will wait at
2363
	 * least until the rcu_read_unlock.
2364 2365
	 */
	rcu_read_lock();
J
Josef Bacik 已提交
2366 2367 2368
	cur_devices = fs_info->fs_devices;
	while (cur_devices) {
		head = &cur_devices->devices;
2369
		list_for_each_entry_rcu(dev, head, dev_list) {
2370
			if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
2371
				continue;
2372 2373
			if (!dev->name)
				continue;
J
Josef Bacik 已提交
2374 2375 2376 2377 2378 2379
			if (!first_dev || dev->devid < first_dev->devid)
				first_dev = dev;
		}
		cur_devices = cur_devices->seed;
	}

2380 2381 2382
	if (first_dev)
		seq_escape(m, rcu_str_deref(first_dev->name), " \t\n\\");
	else
J
Josef Bacik 已提交
2383
		WARN_ON(1);
2384
	rcu_read_unlock();
J
Josef Bacik 已提交
2385 2386 2387
	return 0;
}

2388
static const struct super_operations btrfs_super_ops = {
2389
	.drop_inode	= btrfs_drop_inode,
A
Al Viro 已提交
2390
	.evict_inode	= btrfs_evict_inode,
C
Chris Mason 已提交
2391
	.put_super	= btrfs_put_super,
2392
	.sync_fs	= btrfs_sync_fs,
E
Eric Paris 已提交
2393
	.show_options	= btrfs_show_options,
J
Josef Bacik 已提交
2394
	.show_devname	= btrfs_show_devname,
C
Chris Mason 已提交
2395 2396
	.alloc_inode	= btrfs_alloc_inode,
	.destroy_inode	= btrfs_destroy_inode,
A
Al Viro 已提交
2397
	.free_inode	= btrfs_free_inode,
C
Chris Mason 已提交
2398
	.statfs		= btrfs_statfs,
Y
Yan Zheng 已提交
2399
	.remount_fs	= btrfs_remount,
2400
	.freeze_fs	= btrfs_freeze,
2401
	.unfreeze_fs	= btrfs_unfreeze,
C
Chris Mason 已提交
2402
};
2403 2404

static const struct file_operations btrfs_ctl_fops = {
2405
	.open = btrfs_control_open,
2406
	.unlocked_ioctl	 = btrfs_control_ioctl,
2407
	.compat_ioctl = compat_ptr_ioctl,
2408
	.owner	 = THIS_MODULE,
2409
	.llseek = noop_llseek,
2410 2411 2412
};

static struct miscdevice btrfs_misc = {
2413
	.minor		= BTRFS_MINOR,
2414 2415 2416 2417
	.name		= "btrfs-control",
	.fops		= &btrfs_ctl_fops
};

2418 2419 2420
MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
MODULE_ALIAS("devname:btrfs-control");

2421
/*
 * Register the btrfs-control misc device.  Returns the result of
 * misc_register().
 */
static int __init btrfs_interface_init(void)
{
	int ret;

	ret = misc_register(&btrfs_misc);
	return ret;
}

2426
static __cold void btrfs_interface_exit(void)
2427
{
2428
	misc_deregister(&btrfs_misc);
2429 2430
}

2431
/*
 * Log a one-line "Btrfs loaded" message containing the crc32c implementation
 * name and the optional features selected at build time.
 */
static void __init btrfs_print_mod_info(void)
{
	/* Assembled at compile time from the enabled config options. */
	static const char build_opts[] = ""
#ifdef CONFIG_BTRFS_DEBUG
			", debug=on"
#endif
#ifdef CONFIG_BTRFS_ASSERT
			", assert=on"
#endif
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
			", integrity-checker=on"
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
			", ref-verify=on"
#endif
			;

	pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), build_opts);
}

2450 2451
/*
 * Module initialization: set up each subsystem in turn and finally register
 * the filesystem type.
 *
 * On failure, everything that was set up so far is undone in reverse order
 * through the label chain at the end, and the error is returned.  Returns 0
 * on success.
 */
static int __init init_btrfs_fs(void)
{
	int ret;

	btrfs_props_init();

	/* Nothing to unwind yet if sysfs setup fails. */
	ret = btrfs_init_sysfs();
	if (ret)
		return ret;

	btrfs_init_compress();

	ret = btrfs_init_cachep();
	if (ret)
		goto free_compress;

	ret = extent_io_init();
	if (ret)
		goto free_cachep;

	ret = extent_state_cache_init();
	if (ret)
		goto free_extent_io;

	ret = extent_map_init();
	if (ret)
		goto free_extent_state_cache;

	ret = ordered_data_init();
	if (ret)
		goto free_extent_map;

	ret = btrfs_delayed_inode_init();
	if (ret)
		goto free_ordered_data;

	ret = btrfs_auto_defrag_init();
	if (ret)
		goto free_delayed_inode;

	ret = btrfs_delayed_ref_init();
	if (ret)
		goto free_auto_defrag;

	ret = btrfs_prelim_ref_init();
	if (ret)
		goto free_delayed_ref;

	ret = btrfs_end_io_wq_init();
	if (ret)
		goto free_prelim_ref;

	ret = btrfs_interface_init();
	if (ret)
		goto free_end_io_wq;

	btrfs_init_lockdep();

	btrfs_print_mod_info();

	ret = btrfs_run_sanity_tests();
	if (ret)
		goto unregister_ioctl;

	ret = register_filesystem(&btrfs_fs_type);
	if (ret)
		goto unregister_ioctl;

	return 0;

	/* Error unwinding, in reverse order of the setup above. */
unregister_ioctl:
	btrfs_interface_exit();
free_end_io_wq:
	btrfs_end_io_wq_exit();
free_prelim_ref:
	btrfs_prelim_ref_exit();
free_delayed_ref:
	btrfs_delayed_ref_exit();
free_auto_defrag:
	btrfs_auto_defrag_exit();
free_delayed_inode:
	btrfs_delayed_inode_exit();
free_ordered_data:
	ordered_data_exit();
free_extent_map:
	extent_map_exit();
free_extent_state_cache:
	extent_state_cache_exit();
free_extent_io:
	extent_io_exit();
free_cachep:
	btrfs_destroy_cachep();
free_compress:
	btrfs_exit_compress();
	btrfs_exit_sysfs();

	return ret;
}

/*
 * Module exit: release what init_btrfs_fs() set up and unregister the
 * filesystem type.  The calls are kept in their established order.
 */
static void __exit exit_btrfs_fs(void)
{
	/* caches and per-subsystem state */
	btrfs_destroy_cachep();
	btrfs_delayed_ref_exit();
	btrfs_auto_defrag_exit();
	btrfs_delayed_inode_exit();
	btrfs_prelim_ref_exit();
	ordered_data_exit();
	extent_map_exit();
	extent_state_cache_exit();
	extent_io_exit();

	/* control device, end_io workqueue state and filesystem type */
	btrfs_interface_exit();
	btrfs_end_io_wq_exit();
	unregister_filesystem(&btrfs_fs_type);

	/* remaining global state */
	btrfs_exit_sysfs();
	btrfs_cleanup_fs_uuids();
	btrfs_exit_compress();
}

2568
late_initcall(init_btrfs_fs);
2569 2570 2571
module_exit(exit_btrfs_fs)

MODULE_LICENSE("GPL");
2572
MODULE_SOFTDEP("pre: crc32c");
2573
MODULE_SOFTDEP("pre: xxhash64");
2574
MODULE_SOFTDEP("pre: sha256");
2575
MODULE_SOFTDEP("pre: blake2b-256");