// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/migrate.h>
#include <linux/ratelimit.h>
#include <linux/uuid.h>
#include <linux/semaphore.h>
#include <linux/error-injection.h>
#include <linux/crc32c.h>
#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "print-tree.h"
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "raid56.h"
#include "sysfs.h"
#include "qgroup.h"
#include "compression.h"
#include "tree-checker.h"
#include "ref-verify.h"
#include "block-group.h"
#include "discard.h"
#include "space-info.h"
#include "zoned.h"

#define BTRFS_SUPER_FLAG_SUPP	(BTRFS_HEADER_FLAG_WRITTEN |\
				 BTRFS_HEADER_FLAG_RELOC |\
				 BTRFS_SUPER_FLAG_ERROR |\
				 BTRFS_SUPER_FLAG_SEEDING |\
				 BTRFS_SUPER_FLAG_METADUMP |\
				 BTRFS_SUPER_FLAG_METADUMP_V2)

static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
				      struct btrfs_fs_info *fs_info);
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
					struct extent_io_tree *dirty_pages,
					int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
				       struct extent_io_tree *pinned_extents);
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);

/*
 * btrfs_end_io_wq structs are used to do processing in task context when an IO
 * is complete.  This is used during reads to verify checksums, and it is used
 * by writes to insert metadata for new file extents after IO is complete.
 */
struct btrfs_end_io_wq {
	struct bio *bio;
	bio_end_io_t *end_io;
	void *private;
	struct btrfs_fs_info *info;
	blk_status_t status;
	enum btrfs_wq_endio_type metadata;
	struct btrfs_work work;
};

static struct kmem_cache *btrfs_end_io_wq_cache;

int __init btrfs_end_io_wq_init(void)
{
	btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
					sizeof(struct btrfs_end_io_wq),
					0,
					SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_end_io_wq_cache)
		return -ENOMEM;
	return 0;
}

void __cold btrfs_end_io_wq_exit(void)
{
	kmem_cache_destroy(btrfs_end_io_wq_cache);
}

static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
	if (fs_info->csum_shash)
		crypto_free_shash(fs_info->csum_shash);
}

/*
 * async submit bios are used to offload expensive checksumming
 * onto the worker threads.  They checksum file and metadata bios
 * just before they are sent down the IO stack.
 */
struct async_submit_bio {
	struct inode *inode;
	struct bio *bio;
	extent_submit_bio_start_t *submit_bio_start;
	int mirror_num;

	/* Optional parameter for submit_bio_start used by direct io */
	u64 dio_file_offset;
	struct btrfs_work work;
	blk_status_t status;
};

/*
 * Lockdep class keys for extent_buffer->lock's in this root.  For a given
 * eb, the lockdep key is determined by the btrfs_root it belongs to and
 * the level the eb occupies in the tree.
 *
 * Different roots are used for different purposes and may nest inside each
 * other and they require separate keysets.  As lockdep keys should be
 * static, assign keysets according to the purpose of the root as indicated
 * by btrfs_root->root_key.objectid.  This ensures that all special purpose
 * roots have separate keysets.
 *
 * Lock-nesting across peer nodes is always done with the immediate parent
 * node locked thus preventing deadlock.  As lockdep doesn't know this, use
 * subclass to avoid triggering lockdep warning in such cases.
 *
 * The key is set by the readpage_end_io_hook after the buffer has passed
 * csum validation but before the pages are unlocked.  It is also set by
 * btrfs_init_new_buffer on freshly allocated blocks.
 *
 * We also add a check to make sure the highest level of the tree is the
 * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
 * needs update as well.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# if BTRFS_MAX_LEVEL != 8
#  error
# endif

#define DEFINE_LEVEL(stem, level)					\
	.names[level] = "btrfs-" stem "-0" #level,

#define DEFINE_NAME(stem)						\
	DEFINE_LEVEL(stem, 0)						\
	DEFINE_LEVEL(stem, 1)						\
	DEFINE_LEVEL(stem, 2)						\
	DEFINE_LEVEL(stem, 3)						\
	DEFINE_LEVEL(stem, 4)						\
	DEFINE_LEVEL(stem, 5)						\
	DEFINE_LEVEL(stem, 6)						\
	DEFINE_LEVEL(stem, 7)

static struct btrfs_lockdep_keyset {
	u64			id;		/* root objectid */
	/* Longest entry: btrfs-free-space-00 */
	char			names[BTRFS_MAX_LEVEL][20];
	struct lock_class_key	keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
	{ .id = BTRFS_ROOT_TREE_OBJECTID,	DEFINE_NAME("root")	},
	{ .id = BTRFS_EXTENT_TREE_OBJECTID,	DEFINE_NAME("extent")	},
	{ .id = BTRFS_CHUNK_TREE_OBJECTID,	DEFINE_NAME("chunk")	},
	{ .id = BTRFS_DEV_TREE_OBJECTID,	DEFINE_NAME("dev")	},
	{ .id = BTRFS_CSUM_TREE_OBJECTID,	DEFINE_NAME("csum")	},
	{ .id = BTRFS_QUOTA_TREE_OBJECTID,	DEFINE_NAME("quota")	},
	{ .id = BTRFS_TREE_LOG_OBJECTID,	DEFINE_NAME("log")	},
	{ .id = BTRFS_TREE_RELOC_OBJECTID,	DEFINE_NAME("treloc")	},
	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	DEFINE_NAME("dreloc")	},
	{ .id = BTRFS_UUID_TREE_OBJECTID,	DEFINE_NAME("uuid")	},
	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	DEFINE_NAME("free-space") },
	{ .id = 0,				DEFINE_NAME("tree")	},
};

#undef DEFINE_LEVEL
#undef DEFINE_NAME

void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
				    int level)
{
	struct btrfs_lockdep_keyset *ks;

	BUG_ON(level >= ARRAY_SIZE(ks->keys));

	/* find the matching keyset, id 0 is the default entry */
	for (ks = btrfs_lockdep_keysets; ks->id; ks++)
		if (ks->id == objectid)
			break;

	lockdep_set_class_and_name(&eb->lock,
				   &ks->keys[level], ks->names[level]);
}

#endif

/*
 * Compute the csum of a btree block and store the result to provided buffer.
 */
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
	struct btrfs_fs_info *fs_info = buf->fs_info;
	const int num_pages = fs_info->nodesize >> PAGE_SHIFT;
	const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	char *kaddr;
	int i;

	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);
	kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
			    first_page_part - BTRFS_CSUM_SIZE);

	for (i = 1; i < num_pages; i++) {
		kaddr = page_address(buf->pages[i]);
		crypto_shash_update(shash, kaddr, PAGE_SIZE);
	}
	memset(result, 0, BTRFS_CSUM_SIZE);
	crypto_shash_final(shash, result);
}

/*
 * we can't consider a given block up to date unless the transid of the
 * block matches the transid in the parent node's pointer.  This is how we
 * detect blocks that either didn't get written at all or got written
 * in the wrong place.
 */
static int verify_parent_transid(struct extent_io_tree *io_tree,
				 struct extent_buffer *eb, u64 parent_transid,
				 int atomic)
{
	struct extent_state *cached_state = NULL;
	int ret;
	bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);

	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
		return 0;

	if (atomic)
		return -EAGAIN;

	if (need_lock)
		btrfs_tree_read_lock(eb);

	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
			 &cached_state);
	if (extent_buffer_uptodate(eb) &&
	    btrfs_header_generation(eb) == parent_transid) {
		ret = 0;
		goto out;
	}
	btrfs_err_rl(eb->fs_info,
		"parent transid verify failed on %llu wanted %llu found %llu",
			eb->start,
			parent_transid, btrfs_header_generation(eb));
	ret = 1;

	/*
	 * Things reading via commit roots that don't have normal protection,
	 * like send, can have a really old block in cache that may point at a
	 * block that has been freed and re-allocated.  So don't clear uptodate
	 * if we find an eb that is under IO (dirty/writeback) because we could
	 * end up reading in the stale data and then writing it back out and
	 * making everybody very sad.
	 */
	if (!extent_buffer_under_io(eb))
		clear_extent_buffer_uptodate(eb);
out:
	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
			     &cached_state);
	if (need_lock)
		btrfs_tree_read_unlock(eb);
	return ret;
}

static bool btrfs_supported_super_csum(u16 csum_type)
{
	switch (csum_type) {
	case BTRFS_CSUM_TYPE_CRC32:
	case BTRFS_CSUM_TYPE_XXHASH:
	case BTRFS_CSUM_TYPE_SHA256:
	case BTRFS_CSUM_TYPE_BLAKE2:
		return true;
	default:
		return false;
	}
}

/*
 * Return 0 if the superblock checksum type matches the checksum value of that
 * algorithm. Pass the raw disk superblock data.
 */
static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
				  char *raw_disk_sb)
{
	struct btrfs_super_block *disk_sb =
		(struct btrfs_super_block *)raw_disk_sb;
	char result[BTRFS_CSUM_SIZE];
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);

	shash->tfm = fs_info->csum_shash;

	/*
	 * The super_block structure does not span the whole
	 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
	 * filled with zeros and is included in the checksum.
	 */
	crypto_shash_digest(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);

	if (memcmp(disk_sb->csum, result, fs_info->csum_size))
		return 1;

	return 0;
}

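/*
 * Verify that @eb has the expected @level and, when @first_key is provided,
 * that the key in slot 0 matches it.  Returns 0 on success, -EIO on a level
 * mismatch and -EUCLEAN on a bad first key or an empty node.
 */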
int btrfs_verify_level_key(struct extent_buffer *eb, int level,
			   struct btrfs_key *first_key, u64 parent_transid)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	int found_level;
	struct btrfs_key found_key;
	int ret;

	found_level = btrfs_header_level(eb);
	if (found_level != level) {
		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
		     KERN_ERR "BTRFS: tree level check failed\n");
		btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
			  eb->start, level, found_level);
		return -EIO;
	}

	if (!first_key)
		return 0;

	/*
	 * For live tree blocks (new tree blocks in the current transaction),
	 * we need proper lock context to avoid races, which is impossible
	 * here.  So we only check tree blocks that were read from disk, whose
	 * generation <= fs_info->last_trans_committed.
	 */
	if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
		return 0;

	/* We have @first_key, so this @eb must have at least one item */
	if (btrfs_header_nritems(eb) == 0) {
		btrfs_err(fs_info,
		"invalid tree nritems, bytenr=%llu nritems=0 expect >0",
			  eb->start);
		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
		return -EUCLEAN;
	}

	if (found_level)
		btrfs_node_key_to_cpu(eb, &found_key, 0);
	else
		btrfs_item_key_to_cpu(eb, &found_key, 0);
	ret = btrfs_comp_cpu_keys(first_key, &found_key);

	if (ret) {
		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
		     KERN_ERR "BTRFS: tree first key check failed\n");
		btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
			  eb->start, parent_transid, first_key->objectid,
			  first_key->type, first_key->offset,
			  found_key.objectid, found_key.type,
			  found_key.offset);
	}
	return ret;
}

/*
 * helper to read a given tree block, doing retries as required when
 * the checksums don't match and we have alternate mirrors to try.
 *
 * @parent_transid:	expected transid, skip check if 0
 * @level:		expected level, mandatory check
 * @first_key:		expected key of first slot, skip check if NULL
 */
static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
					  u64 parent_transid, int level,
					  struct btrfs_key *first_key)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct extent_io_tree *io_tree;
	int failed = 0;
	int ret;
	int num_copies = 0;
	int mirror_num = 0;
	int failed_mirror = 0;

	io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
	while (1) {
		clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
		ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
		if (!ret) {
			if (verify_parent_transid(io_tree, eb,
						   parent_transid, 0))
				ret = -EIO;
			else if (btrfs_verify_level_key(eb, level,
						first_key, parent_transid))
				ret = -EUCLEAN;
			else
				break;
		}

		num_copies = btrfs_num_copies(fs_info,
					      eb->start, eb->len);
		if (num_copies == 1)
			break;

		if (!failed_mirror) {
			failed = 1;
			failed_mirror = eb->read_mirror;
		}

		mirror_num++;
		if (mirror_num == failed_mirror)
			mirror_num++;

		if (mirror_num > num_copies)
			break;
	}

	if (failed && !ret && failed_mirror)
		btrfs_repair_eb_io_failure(eb, failed_mirror);

	return ret;
}

/*
 * Checksum a dirty tree block before IO.  This has extra checks to make sure
 * we only fill in the checksum field in the first page of a multi-page block.
 * For subpage extent buffers we need bvec to also read the offset in the page.
 */
static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec)
{
	struct page *page = bvec->bv_page;
	u64 start = page_offset(page);
	u64 found_start;
	u8 result[BTRFS_CSUM_SIZE];
	struct extent_buffer *eb;
	int ret;

	eb = (struct extent_buffer *)page->private;
	if (page != eb->pages[0])
		return 0;

	found_start = btrfs_header_bytenr(eb);

	if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
		WARN_ON(found_start != 0);
		return 0;
	}

	/*
	 * Please do not consolidate these warnings into a single if.
	 * It is useful to know what went wrong.
	 */
	if (WARN_ON(found_start != start))
		return -EUCLEAN;
	if (WARN_ON(!PageUptodate(page)))
		return -EUCLEAN;

	ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
				    offsetof(struct btrfs_header, fsid),
				    BTRFS_FSID_SIZE) == 0);

	csum_tree_block(eb, result);

	if (btrfs_header_level(eb))
		ret = btrfs_check_node(eb);
	else
		ret = btrfs_check_leaf_full(eb);

	if (ret < 0) {
		btrfs_print_tree(eb, 0);
		btrfs_err(fs_info,
		"block=%llu write time tree block corruption detected",
			  eb->start);
		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
		return ret;
	}
	write_extent_buffer(eb, result, 0, fs_info->csum_size);

	return 0;
}

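/*
 * Check that the fsid in the tree block header matches this filesystem
 * (or one of its seed devices).  Returns 0 on a match, 1 otherwise.
 */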
static int check_tree_block_fsid(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
	u8 fsid[BTRFS_FSID_SIZE];
	u8 *metadata_uuid;

	read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
			   BTRFS_FSID_SIZE);
	/*
	 * Checking the incompat flag is only valid for the current fs. For
	 * seed devices it's forbidden to have their uuid changed so reading
	 * ->fsid in this case is fine
	 */
	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
		metadata_uuid = fs_devices->metadata_uuid;
	else
		metadata_uuid = fs_devices->fsid;

	if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE))
		return 0;

	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
		if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
			return 0;

	return 1;
}

/* Do basic extent buffer checks at read time */
static int validate_extent_buffer(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	u64 found_start;
	const u32 csum_size = fs_info->csum_size;
	u8 found_level;
	u8 result[BTRFS_CSUM_SIZE];
	int ret = 0;

	found_start = btrfs_header_bytenr(eb);
	if (found_start != eb->start) {
		btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
			     eb->start, found_start);
		ret = -EIO;
		goto out;
	}
	if (check_tree_block_fsid(eb)) {
		btrfs_err_rl(fs_info, "bad fsid on block %llu",
			     eb->start);
		ret = -EIO;
		goto out;
	}
	found_level = btrfs_header_level(eb);
	if (found_level >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info, "bad tree block level %d on %llu",
			  (int)btrfs_header_level(eb), eb->start);
		ret = -EIO;
		goto out;
	}

	csum_tree_block(eb, result);

	if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
		u8 val[BTRFS_CSUM_SIZE] = { 0 };

		read_extent_buffer(eb, &val, 0, csum_size);
		btrfs_warn_rl(fs_info,
	"%s checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
			      fs_info->sb->s_id, eb->start,
			      CSUM_FMT_VALUE(csum_size, val),
			      CSUM_FMT_VALUE(csum_size, result),
			      btrfs_header_level(eb));
		ret = -EUCLEAN;
		goto out;
	}

	/*
	 * If this is a leaf block and it is corrupt, set the corrupt bit so
	 * that we don't try and read the other copies of this block, just
	 * return -EIO.
	 */
	if (found_level == 0 && btrfs_check_leaf_full(eb)) {
		set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
		ret = -EIO;
	}

	if (found_level > 0 && btrfs_check_node(eb))
		ret = -EIO;

	if (!ret)
		set_extent_buffer_uptodate(eb);
	else
		btrfs_err(fs_info,
			  "block=%llu read time tree block corruption detected",
			  eb->start);
out:
	return ret;
}

static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
				   int mirror)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
	struct extent_buffer *eb;
	bool reads_done;
	int ret = 0;

	/*
	 * We don't allow bio merge for subpage metadata read, so we should
	 * only get one eb for each endio hook.
	 */
	ASSERT(end == start + fs_info->nodesize - 1);
	ASSERT(PagePrivate(page));

	eb = find_extent_buffer(fs_info, start);
	/*
	 * When we are reading one tree block, eb must have been inserted into
	 * the radix tree. If not, something is wrong.
	 */
	ASSERT(eb);

	reads_done = atomic_dec_and_test(&eb->io_pages);
	/* Subpage read must finish in page read */
	ASSERT(reads_done);

	eb->read_mirror = mirror;
	if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
		ret = -EIO;
		goto err;
	}
	ret = validate_extent_buffer(eb);
	if (ret < 0)
		goto err;

	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
		btree_readahead_hook(eb, ret);

	set_extent_buffer_uptodate(eb);

	free_extent_buffer(eb);
	return ret;
err:
	/*
	 * end_bio_extent_readpage decrements io_pages in case of error,
	 * make sure it has something to decrement.
	 */
	atomic_inc(&eb->io_pages);
	clear_extent_buffer_uptodate(eb);
	free_extent_buffer(eb);
	return ret;
}

int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio,
				   struct page *page, u64 start, u64 end,
				   int mirror)
{
	struct extent_buffer *eb;
	int ret = 0;
	int reads_done;

	ASSERT(page->private);

	if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
		return validate_subpage_buffer(page, start, end, mirror);

	eb = (struct extent_buffer *)page->private;

	/*
	 * The pending IO might have been the only thing that kept this buffer
	 * in memory.  Make sure we have a ref for all this other checks
	 */
	atomic_inc(&eb->refs);

	reads_done = atomic_dec_and_test(&eb->io_pages);
	if (!reads_done)
		goto err;

	eb->read_mirror = mirror;
	if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
		ret = -EIO;
		goto err;
	}
	ret = validate_extent_buffer(eb);
err:
	if (reads_done &&
	    test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
		btree_readahead_hook(eb, ret);

	if (ret) {
		/*
		 * our io error hook is going to dec the io pages
		 * again, we have to make sure it has something
		 * to decrement
		 */
		atomic_inc(&eb->io_pages);
		clear_extent_buffer_uptodate(eb);
	}
	free_extent_buffer(eb);

	return ret;
}

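/*
 * bio completion hook: pick the workqueue matching the IO direction and
 * metadata type, then hand the final processing off to it in task context.
 */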
static void end_workqueue_bio(struct bio *bio)
{
	struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
	struct btrfs_fs_info *fs_info;
	struct btrfs_workqueue *wq;

	fs_info = end_io_wq->info;
	end_io_wq->status = bio->bi_status;

	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
			wq = fs_info->endio_meta_write_workers;
		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
			wq = fs_info->endio_freespace_worker;
		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
			wq = fs_info->endio_raid56_workers;
		else
			wq = fs_info->endio_write_workers;
	} else {
		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
			wq = fs_info->endio_raid56_workers;
		else if (end_io_wq->metadata)
			wq = fs_info->endio_meta_workers;
		else
			wq = fs_info->endio_workers;
	}

	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
	btrfs_queue_work(wq, &end_io_wq->work);
}

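/*
 * Redirect a bio's completion to end_workqueue_bio() so the original end_io
 * runs in task context instead of interrupt context.
 */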
blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
			enum btrfs_wq_endio_type metadata)
{
	struct btrfs_end_io_wq *end_io_wq;

	end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
	if (!end_io_wq)
		return BLK_STS_RESOURCE;

	end_io_wq->private = bio->bi_private;
	end_io_wq->end_io = bio->bi_end_io;
	end_io_wq->info = info;
	end_io_wq->status = 0;
	end_io_wq->bio = bio;
	end_io_wq->metadata = metadata;

	bio->bi_private = end_io_wq;
	bio->bi_end_io = end_workqueue_bio;
	return 0;
}

static void run_one_async_start(struct btrfs_work *work)
{
	struct async_submit_bio *async;
	blk_status_t ret;

	async = container_of(work, struct  async_submit_bio, work);
	ret = async->submit_bio_start(async->inode, async->bio,
				      async->dio_file_offset);
	if (ret)
		async->status = ret;
}

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time.   All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the tree.
 */
static void run_one_async_done(struct btrfs_work *work)
{
	struct async_submit_bio *async;
	struct inode *inode;
	blk_status_t ret;

	async = container_of(work, struct  async_submit_bio, work);
	inode = async->inode;

	/* If an error occurred we just want to clean up the bio and move on */
	if (async->status) {
		async->bio->bi_status = async->status;
		bio_endio(async->bio);
		return;
	}

	/*
	 * All of the bios that pass through here are from async helpers.
	 * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
	 * This changes nothing when cgroups aren't in use.
	 */
	async->bio->bi_opf |= REQ_CGROUP_PUNT;
	ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
	if (ret) {
		async->bio->bi_status = ret;
		bio_endio(async->bio);
	}
}

static void run_one_async_free(struct btrfs_work *work)
{
	struct async_submit_bio *async;

	async = container_of(work, struct  async_submit_bio, work);
	kfree(async);
}

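/*
 * Queue a bio for the async helper threads: submit_bio_start does the
 * checksumming there, and the bio is then mapped and submitted from the
 * worker in run_one_async_done().
 */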
blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
				 int mirror_num, unsigned long bio_flags,
				 u64 dio_file_offset,
				 extent_submit_bio_start_t *submit_bio_start)
{
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	struct async_submit_bio *async;

	async = kmalloc(sizeof(*async), GFP_NOFS);
	if (!async)
		return BLK_STS_RESOURCE;

	async->inode = inode;
	async->bio = bio;
	async->mirror_num = mirror_num;
	async->submit_bio_start = submit_bio_start;

	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
			run_one_async_free);

	async->dio_file_offset = dio_file_offset;

	async->status = 0;

	if (op_is_sync(bio->bi_opf))
		btrfs_set_work_high_priority(&async->work);

	btrfs_queue_work(fs_info->workers, &async->work);
	return 0;
}

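/* Checksum each tree block (page) in the bio before it is submitted. */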
static blk_status_t btree_csum_one_bio(struct bio *bio)
{
	struct bio_vec *bvec;
	struct btrfs_root *root;
	int ret = 0;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));
	bio_for_each_segment_all(bvec, bio, iter_all) {
		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
		ret = csum_dirty_buffer(root->fs_info, bvec);
		if (ret)
			break;
	}

	return errno_to_blk_status(ret);
}

static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
					   u64 dio_file_offset)
{
	/*
	 * when we're called for a write, we're already in the async
	 * submission context.  Just jump into btrfs_map_bio
	 */
	return btree_csum_one_bio(bio);
}

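/*
 * Decide whether a metadata write should be checksummed by the async
 * helpers.  Zoned filesystems, synchronous writers and fast checksum
 * implementations all checksum inline instead.
 */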
static int check_async_write(struct btrfs_fs_info *fs_info,
			     struct btrfs_inode *bi)
{
	if (btrfs_is_zoned(fs_info))
		return 0;
	if (atomic_read(&bi->sync_writers))
		return 0;
	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
		return 0;
	return 1;
}

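/* Main entry point for submitting btree (metadata) bios. */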
blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
				       int mirror_num, unsigned long bio_flags)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	int async = check_async_write(fs_info, BTRFS_I(inode));
	blk_status_t ret;

	if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
		/*
		 * called for a read, do the setup so that checksum validation
		 * can happen in the async kernel threads
		 */
		ret = btrfs_bio_wq_end_io(fs_info, bio,
					  BTRFS_WQ_ENDIO_METADATA);
		if (ret)
			goto out_w_error;
		ret = btrfs_map_bio(fs_info, bio, mirror_num);
	} else if (!async) {
		ret = btree_csum_one_bio(bio);
		if (ret)
			goto out_w_error;
		ret = btrfs_map_bio(fs_info, bio, mirror_num);
	} else {
		/*
		 * kthread helpers are used to submit writes so that
		 * checksumming can happen in parallel across all CPUs
		 */
		ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
					  0, btree_submit_bio_start);
	}

	if (ret)
		goto out_w_error;
	return 0;

out_w_error:
	bio->bi_status = ret;
	bio_endio(bio);
	return ret;
}

#ifdef CONFIG_MIGRATION
static int btree_migratepage(struct address_space *mapping,
			struct page *newpage, struct page *page,
			enum migrate_mode mode)
{
	/*
	 * we can't safely write a btree page from here,
	 * we haven't done the locking hook
	 */
	if (PageDirty(page))
		return -EAGAIN;
	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (page_has_private(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return -EAGAIN;
	return migrate_page(mapping, newpage, page, mode);
}
#endif

static int btree_writepages(struct address_space *mapping,
			    struct writeback_control *wbc)
{
	struct btrfs_fs_info *fs_info;
	int ret;

	if (wbc->sync_mode == WB_SYNC_NONE) {

		if (wbc->for_kupdate)
			return 0;

		fs_info = BTRFS_I(mapping->host)->root->fs_info;
		/* this is a bit racy, but that's ok */
		ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
					     BTRFS_DIRTY_METADATA_THRESH,
					     fs_info->dirty_metadata_batch);
		if (ret < 0)
			return 0;
	}
	return btree_write_cache_pages(mapping, wbc);
}

static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
	if (PageWriteback(page) || PageDirty(page))
		return 0;

	return try_release_extent_buffer(page);
}

static void btree_invalidatepage(struct page *page, unsigned int offset,
				 unsigned int length)
{
	struct extent_io_tree *tree;
	tree = &BTRFS_I(page->mapping->host)->io_tree;
	extent_invalidatepage(tree, page, offset);
	btree_releasepage(page, GFP_NOFS);
	if (PagePrivate(page)) {
		btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
			   "page private not zero on page %llu",
			   (unsigned long long)page_offset(page));
		detach_page_private(page);
	}
}

static int btree_set_page_dirty(struct page *page)
{
#ifdef DEBUG
	struct extent_buffer *eb;

	BUG_ON(!PagePrivate(page));
	eb = (struct extent_buffer *)page->private;
	BUG_ON(!eb);
	BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
	BUG_ON(!atomic_read(&eb->refs));
	btrfs_assert_tree_locked(eb);
#endif
	return __set_page_dirty_nobuffers(page);
}

static const struct address_space_operations btree_aops = {
	.writepages	= btree_writepages,
	.releasepage	= btree_releasepage,
	.invalidatepage = btree_invalidatepage,
#ifdef CONFIG_MIGRATION
	.migratepage	= btree_migratepage,
#endif
	.set_page_dirty = btree_set_page_dirty,
};

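/*
 * Return the extent buffer for @bytenr, allocating a new one if it is not
 * already cached.
 */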
struct extent_buffer *btrfs_find_create_tree_block(
						struct btrfs_fs_info *fs_info,
						u64 bytenr, u64 owner_root,
						int level)
{
	if (btrfs_is_testing(fs_info))
		return alloc_test_extent_buffer(fs_info, bytenr);
	return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
}

/*
 * Read the tree block at logical address @bytenr and do basic but critical
 * verification.
 *
 * @owner_root:		the objectid of the root owner for this block.
 * @parent_transid:	expected transid of this tree block, skip check if 0
 * @level:		expected level, mandatory check
 * @first_key:		expected key in slot 0, skip check if NULL
 */
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
				      u64 owner_root, u64 parent_transid,
				      int level, struct btrfs_key *first_key)
{
	struct extent_buffer *buf = NULL;
	int ret;

	buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
	if (IS_ERR(buf))
		return buf;

	ret = btree_read_extent_buffer_pages(buf, parent_transid,
					     level, first_key);
	if (ret) {
		free_extent_buffer_stale(buf);
		return ERR_PTR(ret);
	}
	return buf;
}

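/*
 * If @buf was modified in the currently running transaction, clear its
 * dirty bit and the matching dirty_metadata_bytes accounting.
 */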
void btrfs_clean_tree_block(struct extent_buffer *buf)
{
	struct btrfs_fs_info *fs_info = buf->fs_info;
	if (btrfs_header_generation(buf) ==
	    fs_info->running_transaction->transid) {
		btrfs_assert_tree_locked(buf);

		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
			percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
						 -buf->len,
						 fs_info->dirty_metadata_batch);
			clear_extent_buffer_dirty(buf);
		}
	}
}

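/* Initialize the in-memory fields of a newly allocated btrfs_root. */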
static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
			 u64 objectid)
{
	bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
	root->fs_info = fs_info;
	root->node = NULL;
	root->commit_root = NULL;
	root->state = 0;
	root->orphan_cleanup_state = 0;

	root->last_trans = 0;
	root->free_objectid = 0;
	root->nr_delalloc_inodes = 0;
	root->nr_ordered_extents = 0;
	root->inode_tree = RB_ROOT;
	INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
	root->block_rsv = NULL;

	INIT_LIST_HEAD(&root->dirty_list);
	INIT_LIST_HEAD(&root->root_list);
	INIT_LIST_HEAD(&root->delalloc_inodes);
	INIT_LIST_HEAD(&root->delalloc_root);
	INIT_LIST_HEAD(&root->ordered_extents);
	INIT_LIST_HEAD(&root->ordered_root);
	INIT_LIST_HEAD(&root->reloc_dirty_list);
	INIT_LIST_HEAD(&root->logged_list[0]);
	INIT_LIST_HEAD(&root->logged_list[1]);
	spin_lock_init(&root->inode_lock);
	spin_lock_init(&root->delalloc_lock);
	spin_lock_init(&root->ordered_extent_lock);
	spin_lock_init(&root->accounting_lock);
	spin_lock_init(&root->log_extents_lock[0]);
	spin_lock_init(&root->log_extents_lock[1]);
	spin_lock_init(&root->qgroup_meta_rsv_lock);
	mutex_init(&root->objectid_mutex);
	mutex_init(&root->log_mutex);
	mutex_init(&root->ordered_extent_mutex);
	mutex_init(&root->delalloc_mutex);
	init_waitqueue_head(&root->qgroup_flush_wait);
	init_waitqueue_head(&root->log_writer_wait);
	init_waitqueue_head(&root->log_commit_wait[0]);
	init_waitqueue_head(&root->log_commit_wait[1]);
	INIT_LIST_HEAD(&root->log_ctxs[0]);
	INIT_LIST_HEAD(&root->log_ctxs[1]);
	atomic_set(&root->log_commit[0], 0);
	atomic_set(&root->log_commit[1], 0);
	atomic_set(&root->log_writers, 0);
	atomic_set(&root->log_batch, 0);
	refcount_set(&root->refs, 1);
	atomic_set(&root->snapshot_force_cow, 0);
	atomic_set(&root->nr_swapfiles, 0);
	root->log_transid = 0;
	root->log_transid_committed = -1;
	root->last_log_commit = 0;
	if (!dummy) {
		extent_io_tree_init(fs_info, &root->dirty_log_pages,
				    IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
		extent_io_tree_init(fs_info, &root->log_csum_range,
				    IO_TREE_LOG_CSUM_RANGE, NULL);
	}

	memset(&root->root_key, 0, sizeof(root->root_key));
	memset(&root->root_item, 0, sizeof(root->root_item));
	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
	root->root_key.objectid = objectid;
	root->anon_dev = 0;

	spin_lock_init(&root->root_item_lock);
	btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
#ifdef CONFIG_BTRFS_DEBUG
	INIT_LIST_HEAD(&root->leak_list);
	spin_lock(&fs_info->fs_roots_radix_lock);
	list_add_tail(&root->leak_list, &fs_info->allocated_roots);
	spin_unlock(&fs_info->fs_roots_radix_lock);
#endif
}

static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
					   u64 objectid, gfp_t flags)
{
	struct btrfs_root *root = kzalloc(sizeof(*root), flags);
	if (root)
		__setup_root(root, fs_info, objectid);
	return root;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* Should only be used by the testing infrastructure */
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;

	if (!fs_info)
		return ERR_PTR(-EINVAL);

	root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
	if (!root)
		return ERR_PTR(-ENOMEM);

	/* We don't use the stripesize in selftest, set it as sectorsize */
	root->alloc_bytenr = 0;

	return root;
}
#endif

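/*
 * Create a new tree with a single empty leaf and insert its root item into
 * the root tree.
 */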
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
				     u64 objectid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root;
	struct btrfs_key key;
	unsigned int nofs_flag;
	int ret = 0;

	/*
	 * We're holding a transaction handle, so use a NOFS memory allocation
	 * context to avoid deadlock if reclaim happens.
	 */
	nofs_flag = memalloc_nofs_save();
	root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	if (!root)
		return ERR_PTR(-ENOMEM);

	root->root_key.objectid = objectid;
	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
	root->root_key.offset = 0;

	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
				      BTRFS_NESTING_NORMAL);
	if (IS_ERR(leaf)) {
		ret = PTR_ERR(leaf);
		leaf = NULL;
		goto fail_unlock;
	}

	root->node = leaf;
	btrfs_mark_buffer_dirty(leaf);

	root->commit_root = btrfs_root_node(root);
	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);

	btrfs_set_root_flags(&root->root_item, 0);
	btrfs_set_root_limit(&root->root_item, 0);
	btrfs_set_root_bytenr(&root->root_item, leaf->start);
	btrfs_set_root_generation(&root->root_item, trans->transid);
	btrfs_set_root_level(&root->root_item, 0);
	btrfs_set_root_refs(&root->root_item, 1);
	btrfs_set_root_used(&root->root_item, leaf->len);
	btrfs_set_root_last_snapshot(&root->root_item, 0);
	btrfs_set_root_dirid(&root->root_item, 0);
	if (is_fstree(objectid))
		generate_random_guid(root->root_item.uuid);
	else
		export_guid(root->root_item.uuid, &guid_null);
	btrfs_set_root_drop_level(&root->root_item, 0);

	btrfs_tree_unlock(leaf);

	key.objectid = objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = 0;
	ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
	if (ret)
		goto fail;

	return root;

fail_unlock:
	if (leaf)
		btrfs_tree_unlock(leaf);
fail:
	btrfs_put_root(root);

	return ERR_PTR(ret);
}

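/*
 * Allocate an in-memory log tree root and its first block.  Log roots are
 * never inserted into the root tree.
 */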
static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
					 struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;
	struct extent_buffer *leaf;

	root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
	if (!root)
		return ERR_PTR(-ENOMEM);

	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;

	/*
	 * DON'T set SHAREABLE bit for log trees.
	 *
	 * Log trees are not exposed to user space thus can't be snapshotted,
	 * and they go away before a real commit is actually done.
	 *
	 * They do store pointers to file data extents, and those reference
	 * counts still get updated (along with back refs to the log tree).
	 */

	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
			NULL, 0, 0, 0, BTRFS_NESTING_NORMAL);
	if (IS_ERR(leaf)) {
		btrfs_put_root(root);
		return ERR_CAST(leaf);
	}

	root->node = leaf;

	btrfs_mark_buffer_dirty(root->node);
	btrfs_tree_unlock(root->node);
	return root;
}

int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *log_root;

	log_root = alloc_log_tree(trans, fs_info);
	if (IS_ERR(log_root))
		return PTR_ERR(log_root);
	WARN_ON(fs_info->log_root_tree);
	fs_info->log_root_tree = log_root;
	return 0;
}

int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *log_root;
	struct btrfs_inode_item *inode_item;

	log_root = alloc_log_tree(trans, fs_info);
	if (IS_ERR(log_root))
		return PTR_ERR(log_root);

	log_root->last_trans = trans->transid;
	log_root->root_key.offset = root->root_key.objectid;

	inode_item = &log_root->root_item.inode;
	btrfs_set_stack_inode_generation(inode_item, 1);
	btrfs_set_stack_inode_size(inode_item, 3);
	btrfs_set_stack_inode_nlink(inode_item, 1);
	btrfs_set_stack_inode_nbytes(inode_item,
				     fs_info->nodesize);
	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);

	btrfs_set_root_node(&log_root->root_item, log_root->node);

	WARN_ON(root->log_root);
	root->log_root = log_root;
	root->log_transid = 0;
	root->log_transid_committed = -1;
	root->last_log_commit = 0;
	return 0;
}

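/*
 * Look up a root item in the root tree, given its @key and a caller supplied
 * @path, and read its root node from disk.
 */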
static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
					      struct btrfs_path *path,
					      struct btrfs_key *key)
{
	struct btrfs_root *root;
	struct btrfs_fs_info *fs_info = tree_root->fs_info;
	u64 generation;
	int ret;
	int level;

	root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
	if (!root)
		return ERR_PTR(-ENOMEM);

	ret = btrfs_find_root(tree_root, key, path,
			      &root->root_item, &root->root_key);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto fail;
	}

	generation = btrfs_root_generation(&root->root_item);
	level = btrfs_root_level(&root->root_item);
	root->node = read_tree_block(fs_info,
				     btrfs_root_bytenr(&root->root_item),
				     key->objectid, generation, level, NULL);
	if (IS_ERR(root->node)) {
		ret = PTR_ERR(root->node);
		root->node = NULL;
		goto fail;
	} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
		ret = -EIO;
		goto fail;
	}
	root->commit_root = btrfs_root_node(root);
	return root;
fail:
	btrfs_put_root(root);
	return ERR_PTR(ret);
}

struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
					struct btrfs_key *key)
{
	struct btrfs_root *root;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return ERR_PTR(-ENOMEM);
	root = read_tree_root_path(tree_root, path, key);
	btrfs_free_path(path);

	return root;
}

/*
 * Initialize subvolume root in-memory structure
 *
 * @anon_dev:	anonymous device to attach to the root, if zero, allocate new
 */
static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
{
	int ret;
	unsigned int nofs_flag;

	/*
	 * We might be called under a transaction (e.g. indirect backref
	 * resolution) which could deadlock if it triggers memory reclaim
	 */
	nofs_flag = memalloc_nofs_save();
	ret = btrfs_drew_lock_init(&root->snapshot_lock);
	memalloc_nofs_restore(nofs_flag);
	if (ret)
		goto fail;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
	    root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
		set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
		btrfs_check_and_init_root_item(&root->root_item);
	}

	/*
	 * Don't assign anonymous block device to roots that are not exposed to
	 * userspace, the id pool is limited to 1M
	 */
	if (is_fstree(root->root_key.objectid) &&
	    btrfs_root_refs(&root->root_item) > 0) {
		if (!anon_dev) {
			ret = get_anon_bdev(&root->anon_dev);
			if (ret)
				goto fail;
		} else {
			root->anon_dev = anon_dev;
		}
	}

	mutex_lock(&root->objectid_mutex);
	ret = btrfs_init_root_free_objectid(root);
	if (ret) {
		mutex_unlock(&root->objectid_mutex);
		goto fail;
	}

	ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);

	mutex_unlock(&root->objectid_mutex);

	return 0;
fail:
	/* The caller is responsible to call btrfs_free_fs_root */
	return ret;
}

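/* Look up a root in the fs_roots radix tree and grab a reference to it. */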
static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
					       u64 root_id)
{
	struct btrfs_root *root;

	spin_lock(&fs_info->fs_roots_radix_lock);
	root = radix_tree_lookup(&fs_info->fs_roots_radix,
				 (unsigned long)root_id);
	if (root)
		root = btrfs_grab_root(root);
	spin_unlock(&fs_info->fs_roots_radix_lock);
	return root;
}

static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
						u64 objectid)
{
	if (objectid == BTRFS_ROOT_TREE_OBJECTID)
		return btrfs_grab_root(fs_info->tree_root);
	if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
		return btrfs_grab_root(fs_info->extent_root);
	if (objectid == BTRFS_CHUNK_TREE_OBJECTID)
		return btrfs_grab_root(fs_info->chunk_root);
	if (objectid == BTRFS_DEV_TREE_OBJECTID)
		return btrfs_grab_root(fs_info->dev_root);
	if (objectid == BTRFS_CSUM_TREE_OBJECTID)
		return btrfs_grab_root(fs_info->csum_root);
	if (objectid == BTRFS_QUOTA_TREE_OBJECTID)
		return btrfs_grab_root(fs_info->quota_root) ?
			fs_info->quota_root : ERR_PTR(-ENOENT);
	if (objectid == BTRFS_UUID_TREE_OBJECTID)
		return btrfs_grab_root(fs_info->uuid_root) ?
			fs_info->uuid_root : ERR_PTR(-ENOENT);
	if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
		return btrfs_grab_root(fs_info->free_space_root) ?
			fs_info->free_space_root : ERR_PTR(-ENOENT);
	return NULL;
}

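/*
 * Insert a root into the fs_roots radix tree, taking an extra reference on
 * success.
 */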
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
			 struct btrfs_root *root)
{
	int ret;

	ret = radix_tree_preload(GFP_NOFS);
	if (ret)
		return ret;

	spin_lock(&fs_info->fs_roots_radix_lock);
	ret = radix_tree_insert(&fs_info->fs_roots_radix,
				(unsigned long)root->root_key.objectid,
				root);
	if (ret == 0) {
		btrfs_grab_root(root);
		set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
	}
	spin_unlock(&fs_info->fs_roots_radix_lock);
	radix_tree_preload_end();

	return ret;
}

void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
{
#ifdef CONFIG_BTRFS_DEBUG
	struct btrfs_root *root;

	while (!list_empty(&fs_info->allocated_roots)) {
		char buf[BTRFS_ROOT_NAME_BUF_LEN];

		root = list_first_entry(&fs_info->allocated_roots,
					struct btrfs_root, leak_list);
		btrfs_err(fs_info, "leaked root %s refcount %d",
			  btrfs_root_name(&root->root_key, buf),
			  refcount_read(&root->refs));
		while (refcount_read(&root->refs) > 1)
			btrfs_put_root(root);
		btrfs_put_root(root);
	}
#endif
}

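/* Free an fs_info and every structure and root reference it still holds. */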
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
	percpu_counter_destroy(&fs_info->delalloc_bytes);
	percpu_counter_destroy(&fs_info->ordered_bytes);
	percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
	btrfs_free_csum_hash(fs_info);
	btrfs_free_stripe_hash_table(fs_info);
	btrfs_free_ref_cache(fs_info);
	kfree(fs_info->balance_ctl);
	kfree(fs_info->delayed_root);
	btrfs_put_root(fs_info->extent_root);
	btrfs_put_root(fs_info->tree_root);
	btrfs_put_root(fs_info->chunk_root);
	btrfs_put_root(fs_info->dev_root);
	btrfs_put_root(fs_info->csum_root);
	btrfs_put_root(fs_info->quota_root);
	btrfs_put_root(fs_info->uuid_root);
	btrfs_put_root(fs_info->free_space_root);
	btrfs_put_root(fs_info->fs_root);
	btrfs_put_root(fs_info->data_reloc_root);
	btrfs_check_leaked_roots(fs_info);
	btrfs_extent_buffer_leak_debug_check(fs_info);
	kfree(fs_info->super_copy);
	kfree(fs_info->super_for_commit);
	kvfree(fs_info);
}


/*
 * Get an in-memory reference of a root structure.
 *
 * For essential trees like root/extent tree, we grab it from fs_info directly.
 * For subvolume trees, we check the cached filesystem roots first. If not
 * found, then read it from disk and add it to cached fs roots.
 *
 * Caller should release the root by calling btrfs_put_root() after the usage.
 *
 * NOTE: Reloc and log trees can't be read by this function as they share the
 *	 same root objectid.
 *
 * @objectid:	root id
 * @anon_dev:	preallocated anonymous block device number for new roots,
 * 		pass 0 for new allocation.
 * @check_ref:	whether to check root item references, If true, return -ENOENT
 *		for orphan roots
 */
static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
					     u64 objectid, dev_t anon_dev,
					     bool check_ref)
{
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	root = btrfs_get_global_root(fs_info, objectid);
	if (root)
		return root;
again:
	root = btrfs_lookup_fs_root(fs_info, objectid);
	if (root) {
		/* Shouldn't get preallocated anon_dev for cached roots */
		ASSERT(!anon_dev);
		if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
			btrfs_put_root(root);
			return ERR_PTR(-ENOENT);
		}
		return root;
	}

	key.objectid = objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	root = btrfs_read_tree_root(fs_info->tree_root, &key);
	if (IS_ERR(root))
		return root;

	if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
		ret = -ENOENT;
		goto fail;
	}

	ret = btrfs_init_fs_root(root, anon_dev);
	if (ret)
		goto fail;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto fail;
	}
	key.objectid = BTRFS_ORPHAN_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = objectid;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	btrfs_free_path(path);
	if (ret < 0)
		goto fail;
	if (ret == 0)
		set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);

	ret = btrfs_insert_fs_root(fs_info, root);
	if (ret) {
		btrfs_put_root(root);
		if (ret == -EEXIST)
			goto again;
		goto fail;
	}
	return root;
fail:
	btrfs_put_root(root);
	return ERR_PTR(ret);
}

/*
 * Get in-memory reference of a root structure
 *
 * @objectid:	tree objectid
 * @check_ref:	if set, verify that the tree exists and the item has at least
 *		one reference
 */
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
				     u64 objectid, bool check_ref)
{
	return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
}

/*
 * Get in-memory reference of a root structure, created as new, optionally pass
 * the anonymous block device id
 *
 * @objectid:	tree objectid
 * @anon_dev:	if zero, allocate a new anonymous block device or use the
 *		parameter value
 */
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
					 u64 objectid, dev_t anon_dev)
{
	return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
}

/*
 * btrfs_get_fs_root_commit_root - return a root for the given objectid
 * @fs_info:	the fs_info
 * @objectid:	the objectid we need to lookup
 *
 * This is exclusively used for backref walking, and exists specifically because
 * of how qgroups does lookups.  Qgroups will do a backref lookup at delayed ref
 * creation time, which means we may have to read the tree_root in order to look
 * up a fs root that is not in memory.  If the root is not in memory we will
 * read the tree root commit root and look up the fs root from there.  This is a
 * temporary root, it will not be inserted into the radix tree as it doesn't
 * have the most uptodate information, it'll simply be discarded once the
 * backref code is finished using the root.
 */
struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
						 struct btrfs_path *path,
						 u64 objectid)
{
	struct btrfs_root *root;
	struct btrfs_key key;

	ASSERT(path->search_commit_root && path->skip_locking);

	/*
	 * This can return -ENOENT if we ask for a root that doesn't exist, but
	 * since this is called via the backref walking code we won't be looking
	 * up a root that doesn't exist, unless there's corruption.  So if root
	 * != NULL just return it.
	 */
	root = btrfs_get_global_root(fs_info, objectid);
	if (root)
		return root;

	root = btrfs_lookup_fs_root(fs_info, objectid);
	if (root)
		return root;

	key.objectid = objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	root = read_tree_root_path(fs_info->tree_root, path, &key);
	btrfs_release_path(path);

	return root;
}

/*
 * called by the kthread helper functions to finally call the bio end_io
 * functions.  This is where read checksum verification actually happens
 */
static void end_workqueue_fn(struct btrfs_work *work)
{
	struct bio *bio;
	struct btrfs_end_io_wq *end_io_wq;

	end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
	bio = end_io_wq->bio;

	bio->bi_status = end_io_wq->status;
	bio->bi_private = end_io_wq->private;
	bio->bi_end_io = end_io_wq->end_io;
	bio_endio(bio);
	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
}

static int cleaner_kthread(void *arg)
{
	struct btrfs_root *root = arg;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int again;

	while (1) {
		again = 0;

		set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);

		/* Make the cleaner go to sleep early. */
		if (btrfs_need_cleaner_sleep(fs_info))
			goto sleep;

		/*
		 * Do not do anything if we might cause open_ctree() to block
		 * before we have finished mounting the filesystem.
		 */
		if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
			goto sleep;

		if (!mutex_trylock(&fs_info->cleaner_mutex))
			goto sleep;

		/*
		 * Re-check after taking the mutex: the status of the fs may
		 * have changed between the check above and the trylock.
		 */
		if (btrfs_need_cleaner_sleep(fs_info)) {
			mutex_unlock(&fs_info->cleaner_mutex);
			goto sleep;
		}

		btrfs_run_delayed_iputs(fs_info);

		again = btrfs_clean_one_deleted_snapshot(root);
		mutex_unlock(&fs_info->cleaner_mutex);

		/*
		 * The defragger has dealt with the R/O remount and umount,
		 * needn't do anything special here.
		 */
		btrfs_run_defrag_inodes(fs_info);

		/*
		 * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
		 * with relocation (btrfs_relocate_chunk) and relocation
		 * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
		 * after acquiring fs_info->delete_unused_bgs_mutex. So we
		 * can't hold, nor need to, fs_info->cleaner_mutex when deleting
		 * unused block groups.
		 */
		btrfs_delete_unused_bgs(fs_info);
sleep:
		clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
		if (kthread_should_park())
			kthread_parkme();
		if (kthread_should_stop())
			return 0;
		if (!again) {
			set_current_state(TASK_INTERRUPTIBLE);
			schedule();
			__set_current_state(TASK_RUNNING);
		}
	}
}
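
/*
 * Illustrative sketch (assumed caller, as done in open_ctree()): the cleaner
 * is started once per mount and later woken by the transaction kthread:
 *
 *	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 *					       "btrfs-cleaner");
 *	...
 *	wake_up_process(fs_info->cleaner_kthread);
 */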

static int transaction_kthread(void *arg)
{
	struct btrfs_root *root = arg;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	struct btrfs_transaction *cur;
	u64 transid;
	time64_t delta;
	unsigned long delay;
	bool cannot_commit;

	do {
		cannot_commit = false;
		delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
		mutex_lock(&fs_info->transaction_kthread_mutex);

		spin_lock(&fs_info->trans_lock);
		cur = fs_info->running_transaction;
		if (!cur) {
			spin_unlock(&fs_info->trans_lock);
			goto sleep;
		}

		delta = ktime_get_seconds() - cur->start_time;
		if (cur->state < TRANS_STATE_COMMIT_START &&
		    delta < fs_info->commit_interval) {
			spin_unlock(&fs_info->trans_lock);
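			/*
			 * Sleep only for the remainder of the commit
			 * interval.  Illustrative numbers: with a 30s
			 * interval and a transaction already 10s old, the
			 * full 30s delay is trimmed by roughly the elapsed
			 * time and we nap for about 20s before rechecking.
			 */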
			delay -= msecs_to_jiffies((delta - 1) * 1000);
			delay = min(delay,
				    msecs_to_jiffies(fs_info->commit_interval * 1000));
			goto sleep;
		}
		transid = cur->transid;
		spin_unlock(&fs_info->trans_lock);

		/* If the file system is aborted, this will always fail. */
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) != -ENOENT)
				cannot_commit = true;
			goto sleep;
		}
		if (transid == trans->transid) {
			btrfs_commit_transaction(trans);
		} else {
			btrfs_end_transaction(trans);
		}
sleep:
		wake_up_process(fs_info->cleaner_kthread);
		mutex_unlock(&fs_info->transaction_kthread_mutex);

		if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
				      &fs_info->fs_state)))
			btrfs_cleanup_transaction(fs_info);
		if (!kthread_should_stop() &&
				(!btrfs_transaction_blocked(fs_info) ||
				 cannot_commit))
			schedule_timeout_interruptible(delay);
	} while (!kthread_should_stop());
	return 0;
}

/*
 * This will find the highest generation in the array of root backups.  The
 * index of the highest array is returned, or -EINVAL if we can't find
 * anything.
 *
 * We check to make sure the array is valid by comparing the
 * generation of the latest root in the array with the generation
 * in the super block.  If they don't match we pitch it.
 */
static int find_newest_super_backup(struct btrfs_fs_info *info)
{
	const u64 newest_gen = btrfs_super_generation(info->super_copy);
	u64 cur;
	struct btrfs_root_backup *root_backup;
	int i;

	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
		root_backup = info->super_copy->super_roots + i;
		cur = btrfs_backup_tree_root_gen(root_backup);
		if (cur == newest_gen)
			return i;
	}

	return -EINVAL;
}
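
/*
 * Worked example (illustrative numbers): if the super block generation is
 * 100 and slot 2 of the backup array records a tree root generation of 100,
 * slot 2 was written by the most recent commit and index 2 is returned; if
 * no slot matches, the array can't be trusted and -EINVAL is returned.
 */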

/*
 * copy all the root pointers into the super backup array.
 * this will bump the backup pointer by one when it is
 * done
 */
static void backup_super_roots(struct btrfs_fs_info *info)
{
	const int next_backup = info->backup_root_index;
	struct btrfs_root_backup *root_backup;

	root_backup = info->super_for_commit->super_roots + next_backup;

	/*
	 * make sure all of our padding and empty slots get zero filled
	 * regardless of which ones we use today
	 */
	memset(root_backup, 0, sizeof(*root_backup));

	info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;

	btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
	btrfs_set_backup_tree_root_gen(root_backup,
			       btrfs_header_generation(info->tree_root->node));

	btrfs_set_backup_tree_root_level(root_backup,
			       btrfs_header_level(info->tree_root->node));

	btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
	btrfs_set_backup_chunk_root_gen(root_backup,
			       btrfs_header_generation(info->chunk_root->node));
	btrfs_set_backup_chunk_root_level(root_backup,
			       btrfs_header_level(info->chunk_root->node));

	btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
	btrfs_set_backup_extent_root_gen(root_backup,
			       btrfs_header_generation(info->extent_root->node));
	btrfs_set_backup_extent_root_level(root_backup,
			       btrfs_header_level(info->extent_root->node));

	/*
	 * we might commit during log recovery, which happens before we set
	 * the fs_root.  Make sure it is valid before we fill it in.
	 */
	if (info->fs_root && info->fs_root->node) {
		btrfs_set_backup_fs_root(root_backup,
					 info->fs_root->node->start);
		btrfs_set_backup_fs_root_gen(root_backup,
			       btrfs_header_generation(info->fs_root->node));
		btrfs_set_backup_fs_root_level(root_backup,
			       btrfs_header_level(info->fs_root->node));
	}

	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
	btrfs_set_backup_dev_root_gen(root_backup,
			       btrfs_header_generation(info->dev_root->node));
	btrfs_set_backup_dev_root_level(root_backup,
				       btrfs_header_level(info->dev_root->node));

	btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
	btrfs_set_backup_csum_root_gen(root_backup,
			       btrfs_header_generation(info->csum_root->node));
	btrfs_set_backup_csum_root_level(root_backup,
			       btrfs_header_level(info->csum_root->node));

	btrfs_set_backup_total_bytes(root_backup,
			     btrfs_super_total_bytes(info->super_copy));
	btrfs_set_backup_bytes_used(root_backup,
			     btrfs_super_bytes_used(info->super_copy));
	btrfs_set_backup_num_devices(root_backup,
			     btrfs_super_num_devices(info->super_copy));

	/*
	 * if we don't copy this out to the super_copy, it won't get remembered
	 * for the next commit
	 */
	memcpy(&info->super_copy->super_roots,
	       &info->super_for_commit->super_roots,
	       sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
}
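
/*
 * Worked example (illustrative): with BTRFS_NUM_BACKUP_ROOTS == 4 and
 * backup_root_index == 3, this commit's roots land in slot 3 and the index
 * wraps to (3 + 1) % 4 == 0, so the array behaves as a small ring buffer
 * holding the root pointers of the last four commits.
 */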

/*
 * read_backup_root - Reads a backup root based on the passed priority. Prio 0
 * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
 *
 * fs_info - filesystem whose backup roots need to be read
 * priority - priority of backup root required
 *
 * Returns backup root index on success and -EINVAL otherwise.
 */
static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
{
	int backup_index = find_newest_super_backup(fs_info);
	struct btrfs_super_block *super = fs_info->super_copy;
	struct btrfs_root_backup *root_backup;

	if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
		if (priority == 0)
			return backup_index;

		backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
		backup_index %= BTRFS_NUM_BACKUP_ROOTS;
	} else {
		return -EINVAL;
	}

	root_backup = super->super_roots + backup_index;

	btrfs_set_super_generation(super,
				   btrfs_backup_tree_root_gen(root_backup));
	btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
	btrfs_set_super_root_level(super,
				   btrfs_backup_tree_root_level(root_backup));
	btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));

	/*
	 * FIXME: the total bytes and num_devices need to match, otherwise
	 * a fsck is needed
	 */
	btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
	btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));

	return backup_index;
}
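
/*
 * Worked example (illustrative numbers): if find_newest_super_backup()
 * returns 2 and priority == 1, the slot used is
 * (2 + BTRFS_NUM_BACKUP_ROOTS - 1) % BTRFS_NUM_BACKUP_ROOTS == 1, i.e. the
 * backup written one commit before the newest one.
 */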

/* helper to cleanup workers */
static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
	btrfs_destroy_workqueue(fs_info->fixup_workers);
	btrfs_destroy_workqueue(fs_info->delalloc_workers);
	btrfs_destroy_workqueue(fs_info->workers);
	btrfs_destroy_workqueue(fs_info->endio_workers);
	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
	btrfs_destroy_workqueue(fs_info->rmw_workers);
	btrfs_destroy_workqueue(fs_info->endio_write_workers);
	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
	btrfs_destroy_workqueue(fs_info->delayed_workers);
	btrfs_destroy_workqueue(fs_info->caching_workers);
	btrfs_destroy_workqueue(fs_info->readahead_workers);
	btrfs_destroy_workqueue(fs_info->flush_workers);
	btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
	if (fs_info->discard_ctl.discard_workers)
		destroy_workqueue(fs_info->discard_ctl.discard_workers);
	/*
	 * Now that all other work queues are destroyed, we can safely destroy
	 * the queues used for metadata I/O, since tasks from those other work
	 * queues can do metadata I/O operations.
	 */
	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
	btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
}

static void free_root_extent_buffers(struct btrfs_root *root)
{
	if (root) {
		free_extent_buffer(root->node);
		free_extent_buffer(root->commit_root);
		root->node = NULL;
		root->commit_root = NULL;
	}
}

/* helper to cleanup tree roots */
static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
{
	free_root_extent_buffers(info->tree_root);

	free_root_extent_buffers(info->dev_root);
	free_root_extent_buffers(info->extent_root);
	free_root_extent_buffers(info->csum_root);
	free_root_extent_buffers(info->quota_root);
	free_root_extent_buffers(info->uuid_root);
	free_root_extent_buffers(info->fs_root);
	free_root_extent_buffers(info->data_reloc_root);
	if (free_chunk_root)
		free_root_extent_buffers(info->chunk_root);
	free_root_extent_buffers(info->free_space_root);
}

void btrfs_put_root(struct btrfs_root *root)
{
	if (!root)
		return;

	if (refcount_dec_and_test(&root->refs)) {
		WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
		WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
		if (root->anon_dev)
			free_anon_bdev(root->anon_dev);
		btrfs_drew_lock_destroy(&root->snapshot_lock);
		free_root_extent_buffers(root);
#ifdef CONFIG_BTRFS_DEBUG
		spin_lock(&root->fs_info->fs_roots_radix_lock);
		list_del_init(&root->leak_list);
		spin_unlock(&root->fs_info->fs_roots_radix_lock);
#endif
		kfree(root);
	}
}

void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
	int ret;
	struct btrfs_root *gang[8];
	int i;

	while (!list_empty(&fs_info->dead_roots)) {
		gang[0] = list_entry(fs_info->dead_roots.next,
				     struct btrfs_root, root_list);
		list_del(&gang[0]->root_list);

		if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
			btrfs_drop_and_free_fs_root(fs_info, gang[0]);
		btrfs_put_root(gang[0]);
	}

	while (1) {
		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
					     (void **)gang, 0,
					     ARRAY_SIZE(gang));
		if (!ret)
			break;
		for (i = 0; i < ret; i++)
			btrfs_drop_and_free_fs_root(fs_info, gang[i]);
	}
}

static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
{
	mutex_init(&fs_info->scrub_lock);
	atomic_set(&fs_info->scrubs_running, 0);
	atomic_set(&fs_info->scrub_pause_req, 0);
	atomic_set(&fs_info->scrubs_paused, 0);
	atomic_set(&fs_info->scrub_cancel_req, 0);
	init_waitqueue_head(&fs_info->scrub_pause_wait);
	refcount_set(&fs_info->scrub_workers_refcnt, 0);
}

static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
{
	spin_lock_init(&fs_info->balance_lock);
	mutex_init(&fs_info->balance_mutex);
	atomic_set(&fs_info->balance_pause_req, 0);
	atomic_set(&fs_info->balance_cancel_req, 0);
	fs_info->balance_ctl = NULL;
	init_waitqueue_head(&fs_info->balance_wait_q);
}

static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
{
	struct inode *inode = fs_info->btree_inode;

	inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
	set_nlink(inode, 1);
	/*
	 * we set the i_size on the btree inode to the max possible int.
	 * the real end of the address space is determined by all of
	 * the devices in the system
	 */
	inode->i_size = OFFSET_MAX;
	inode->i_mapping->a_ops = &btree_aops;

	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
	extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
			    IO_TREE_BTREE_INODE_IO, inode);
	BTRFS_I(inode)->io_tree.track_uptodate = false;
	extent_map_tree_init(&BTRFS_I(inode)->extent_tree);

	BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
	memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
	btrfs_insert_inode_hash(inode);
}

static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
{
	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
	init_rwsem(&fs_info->dev_replace.rwsem);
	init_waitqueue_head(&fs_info->dev_replace.replace_wait);
}

static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
{
	spin_lock_init(&fs_info->qgroup_lock);
	mutex_init(&fs_info->qgroup_ioctl_lock);
	fs_info->qgroup_tree = RB_ROOT;
	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
	fs_info->qgroup_seq = 1;
	fs_info->qgroup_ulist = NULL;
	fs_info->qgroup_rescan_running = false;
	mutex_init(&fs_info->qgroup_rescan_lock);
}

static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
		struct btrfs_fs_devices *fs_devices)
{
	u32 max_active = fs_info->thread_pool_size;
	unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;

	fs_info->workers =
		btrfs_alloc_workqueue(fs_info, "worker",
				      flags | WQ_HIGHPRI, max_active, 16);

	fs_info->delalloc_workers =
		btrfs_alloc_workqueue(fs_info, "delalloc",
				      flags, max_active, 2);

	fs_info->flush_workers =
		btrfs_alloc_workqueue(fs_info, "flush_delalloc",
				      flags, max_active, 0);

	fs_info->caching_workers =
		btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);

	fs_info->fixup_workers =
		btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);

	/*
	 * endios are largely parallel and should have a very
	 * low idle thresh
	 */
	fs_info->endio_workers =
		btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
	fs_info->endio_meta_workers =
		btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
				      max_active, 4);
	fs_info->endio_meta_write_workers =
		btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
				      max_active, 2);
	fs_info->endio_raid56_workers =
		btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
				      max_active, 4);
	fs_info->rmw_workers =
		btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
	fs_info->endio_write_workers =
		btrfs_alloc_workqueue(fs_info, "endio-write", flags,
				      max_active, 2);
	fs_info->endio_freespace_worker =
		btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
				      max_active, 0);
	fs_info->delayed_workers =
		btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
				      max_active, 0);
	fs_info->readahead_workers =
		btrfs_alloc_workqueue(fs_info, "readahead", flags,
				      max_active, 2);
	fs_info->qgroup_rescan_workers =
		btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
	fs_info->discard_ctl.discard_workers =
		alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);

	if (!(fs_info->workers && fs_info->delalloc_workers &&
	      fs_info->flush_workers &&
	      fs_info->endio_workers && fs_info->endio_meta_workers &&
	      fs_info->endio_meta_write_workers &&
	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
	      fs_info->caching_workers && fs_info->readahead_workers &&
	      fs_info->fixup_workers && fs_info->delayed_workers &&
	      fs_info->qgroup_rescan_workers &&
	      fs_info->discard_ctl.discard_workers)) {
		return -ENOMEM;
	}

	return 0;
}
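
/*
 * Note on the btrfs_alloc_workqueue() calls above (a sketch of the
 * convention, not a new API): the last two arguments are the maximum
 * number of active workers and the idle thresh used to scale concurrency,
 * e.g. "endio" runs up to max_active workers with a low thresh of 4
 * because end-io items are small and highly parallel.
 */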

static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
{
	struct crypto_shash *csum_shash;
	const char *csum_driver = btrfs_super_csum_driver(csum_type);

	csum_shash = crypto_alloc_shash(csum_driver, 0, 0);

	if (IS_ERR(csum_shash)) {
		btrfs_err(fs_info, "error allocating %s hash for checksum",
			  csum_driver);
		return PTR_ERR(csum_shash);
	}

	fs_info->csum_shash = csum_shash;

	return 0;
}
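
/*
 * Illustrative sketch (not the exact caller in this file): once allocated,
 * the tfm is typically driven through an on-stack descriptor to checksum
 * a buffer:
 *
 *	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 *
 *	shash->tfm = fs_info->csum_shash;
 *	crypto_shash_digest(shash, data, len, result);
 */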

static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
			    struct btrfs_fs_devices *fs_devices)
{
	int ret;
	struct btrfs_root *log_tree_root;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	u64 bytenr = btrfs_super_log_root(disk_super);
	int level = btrfs_super_log_root_level(disk_super);

	if (fs_devices->rw_devices == 0) {
		btrfs_warn(fs_info, "log replay required on RO media");
		return -EIO;
	}

	log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
					 GFP_KERNEL);
	if (!log_tree_root)
		return -ENOMEM;

	log_tree_root->node = read_tree_block(fs_info, bytenr,
					      BTRFS_TREE_LOG_OBJECTID,
					      fs_info->generation + 1, level,
					      NULL);
	if (IS_ERR(log_tree_root->node)) {
		btrfs_warn(fs_info, "failed to read log tree");
		ret = PTR_ERR(log_tree_root->node);
		log_tree_root->node = NULL;
		btrfs_put_root(log_tree_root);
		return ret;
	} else if (!extent_buffer_uptodate(log_tree_root->node)) {
		btrfs_err(fs_info, "failed to read log tree");
		btrfs_put_root(log_tree_root);
		return -EIO;
	}
	/* returns with log_tree_root freed on success */
	ret = btrfs_recover_log_trees(log_tree_root);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to recover log tree");
		btrfs_put_root(log_tree_root);
		return ret;
	}

	if (sb_rdonly(fs_info->sb)) {
		ret = btrfs_commit_super(fs_info);
		if (ret)
			return ret;
	}

	return 0;
}

static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root;
	struct btrfs_key location;
	int ret;

	BUG_ON(!fs_info->tree_root);

	location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
	location.type = BTRFS_ROOT_ITEM_KEY;
	location.offset = 0;

	root = btrfs_read_tree_root(tree_root, &location);
	if (IS_ERR(root)) {
		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
			ret = PTR_ERR(root);
			goto out;
		}
	} else {
		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
		fs_info->extent_root = root;
	}

	location.objectid = BTRFS_DEV_TREE_OBJECTID;
	root = btrfs_read_tree_root(tree_root, &location);
	if (IS_ERR(root)) {
		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
			ret = PTR_ERR(root);
			goto out;
		}
	} else {
		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
		fs_info->dev_root = root;
		btrfs_init_devices_late(fs_info);
	}

	/* If IGNOREDATACSUMS is set don't bother reading the csum root. */
	if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
		location.objectid = BTRFS_CSUM_TREE_OBJECTID;
		root = btrfs_read_tree_root(tree_root, &location);
		if (IS_ERR(root)) {
			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
				ret = PTR_ERR(root);
				goto out;
			}
		} else {
			set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
			fs_info->csum_root = root;
		}
	}

	/*
	 * This tree can share blocks with some other fs tree during relocation
	 * and we need a proper setup by btrfs_get_fs_root
	 */
	root = btrfs_get_fs_root(tree_root->fs_info,
				 BTRFS_DATA_RELOC_TREE_OBJECTID, true);
	if (IS_ERR(root)) {
		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
			ret = PTR_ERR(root);
			goto out;
		}
	} else {
		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
		fs_info->data_reloc_root = root;
	}

	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
	root = btrfs_read_tree_root(tree_root, &location);
	if (!IS_ERR(root)) {
		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
		set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
		fs_info->quota_root = root;
	}

	location.objectid = BTRFS_UUID_TREE_OBJECTID;
	root = btrfs_read_tree_root(tree_root, &location);
	if (IS_ERR(root)) {
		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
			ret = PTR_ERR(root);
			if (ret != -ENOENT)
				goto out;
		}
	} else {
		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
		fs_info->uuid_root = root;
	}

	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
		location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
		root = btrfs_read_tree_root(tree_root, &location);
		if (IS_ERR(root)) {
			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
				ret = PTR_ERR(root);
				goto out;
			}
		} else {
			set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
			fs_info->free_space_root = root;
		}
	}

	return 0;
out:
	btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
		   location.objectid, ret);
	return ret;
}

/*
 * Real super block validation
 * NOTE: super csum type and incompat features will not be checked here.
 *
 * @sb:		super block to check
 * @mirror_num:	the super block number to check its bytenr:
 * 		0	the primary (1st) sb
 * 		1, 2	2nd and 3rd backup copy
 * 	       -1	skip bytenr check
 */
static int validate_super(struct btrfs_fs_info *fs_info,
			    struct btrfs_super_block *sb, int mirror_num)
{
	u64 nodesize = btrfs_super_nodesize(sb);
	u64 sectorsize = btrfs_super_sectorsize(sb);
	int ret = 0;

	if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
		btrfs_err(fs_info, "no valid FS found");
		ret = -EINVAL;
	}
	if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
		btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
				btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
		ret = -EINVAL;
	}
	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info, "tree_root level too big: %d >= %d",
				btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
		ret = -EINVAL;
	}
	if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
				btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
		ret = -EINVAL;
	}
	if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info, "log_root level too big: %d >= %d",
				btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
		ret = -EINVAL;
	}

	/*
	 * Check sectorsize and nodesize first, other check will need it.
	 * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
	 */
	if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
	    sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
		btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
		ret = -EINVAL;
	}

	/*
	 * For 4K page size, we only support 4K sector size.
	 * For 64K page size, we support read-write for 64K sector size, and
	 * read-only for 4K sector size.
	 */
	if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
	    (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
				     sectorsize != SZ_64K))) {
		btrfs_err(fs_info,
			"sectorsize %llu not yet supported for page size %lu",
			sectorsize, PAGE_SIZE);
		ret = -EINVAL;
	}

	if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
	    nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
		btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
		ret = -EINVAL;
	}
	if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
		btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
			  le32_to_cpu(sb->__unused_leafsize), nodesize);
		ret = -EINVAL;
	}

	/* Root alignment check */
	if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
		btrfs_warn(fs_info, "tree_root block unaligned: %llu",
			   btrfs_super_root(sb));
		ret = -EINVAL;
	}
	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
		btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
			   btrfs_super_chunk_root(sb));
		ret = -EINVAL;
	}
	if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
		btrfs_warn(fs_info, "log_root block unaligned: %llu",
			   btrfs_super_log_root(sb));
		ret = -EINVAL;
	}

	if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
		   BTRFS_FSID_SIZE) != 0) {
		btrfs_err(fs_info,
			"dev_item UUID does not match metadata fsid: %pU != %pU",
			fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
		ret = -EINVAL;
	}

	/*
	 * Hint to catch really bogus numbers, bitflips or so, more exact checks are
	 * done later
	 */
	if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
		btrfs_err(fs_info, "bytes_used is too small %llu",
			  btrfs_super_bytes_used(sb));
		ret = -EINVAL;
	}
	if (!is_power_of_2(btrfs_super_stripesize(sb))) {
		btrfs_err(fs_info, "invalid stripesize %u",
			  btrfs_super_stripesize(sb));
		ret = -EINVAL;
	}
	if (btrfs_super_num_devices(sb) > (1UL << 31))
		btrfs_warn(fs_info, "suspicious number of devices: %llu",
			   btrfs_super_num_devices(sb));
	if (btrfs_super_num_devices(sb) == 0) {
		btrfs_err(fs_info, "number of devices is 0");
		ret = -EINVAL;
	}

	if (mirror_num >= 0 &&
	    btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
		btrfs_err(fs_info, "super offset mismatch %llu != %u",
			  btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
		ret = -EINVAL;
	}

	/*
	 * Obvious sys_chunk_array corruptions, it must hold at least one key
	 * and one chunk
	 */
	if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
		btrfs_err(fs_info, "system chunk array too big %u > %u",
			  btrfs_super_sys_array_size(sb),
			  BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
		ret = -EINVAL;
	}
	if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
			+ sizeof(struct btrfs_chunk)) {
		btrfs_err(fs_info, "system chunk array too small %u < %zu",
			  btrfs_super_sys_array_size(sb),
			  sizeof(struct btrfs_disk_key)
			  + sizeof(struct btrfs_chunk));
		ret = -EINVAL;
	}

	/*
	 * The generation is a global counter, we'll trust it more than the others
	 * but it's still possible that it's the one that's wrong.
	 */
	if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
		btrfs_warn(fs_info,
			"suspicious: generation < chunk_root_generation: %llu < %llu",
			btrfs_super_generation(sb),
			btrfs_super_chunk_root_generation(sb));
	if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
	    && btrfs_super_cache_generation(sb) != (u64)-1)
		btrfs_warn(fs_info,
			"suspicious: generation < cache_generation: %llu < %llu",
			btrfs_super_generation(sb),
			btrfs_super_cache_generation(sb));

	return ret;
}

/*
 * Validation of super block at mount time.
 * Some checks already done early at mount time, like csum type and incompat
 * flags will be skipped.
 */
static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
{
	return validate_super(fs_info, fs_info->super_copy, 0);
}

/*
 * Validation of super block at write time.
 * Some checks like bytenr check will be skipped as their values will be
 * overwritten soon.
 * Extra checks like csum type and incompat flags will be done here.
 */
static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
				      struct btrfs_super_block *sb)
{
	int ret;

	ret = validate_super(fs_info, sb, -1);
	if (ret < 0)
		goto out;
	if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
		ret = -EUCLEAN;
		btrfs_err(fs_info, "invalid csum type, has %u want %u",
			  btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
		goto out;
	}
	if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
		ret = -EUCLEAN;
		btrfs_err(fs_info,
		"invalid incompat flags, has 0x%llx valid mask 0x%llx",
			  btrfs_super_incompat_flags(sb),
			  (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
		goto out;
	}
out:
	if (ret < 0)
		btrfs_err(fs_info,
		"super block corruption detected before writing it to disk");
	return ret;
}

static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
{
	int backup_index = find_newest_super_backup(fs_info);
	struct btrfs_super_block *sb = fs_info->super_copy;
	struct btrfs_root *tree_root = fs_info->tree_root;
	bool handle_error = false;
	int ret = 0;
	int i;

	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
		u64 generation;
		int level;

		if (handle_error) {
			if (!IS_ERR(tree_root->node))
				free_extent_buffer(tree_root->node);
			tree_root->node = NULL;

			if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
				break;

			free_root_pointers(fs_info, 0);

			/*
			 * Don't use the log in recovery mode, it won't be
			 * valid
			 */
			btrfs_set_super_log_root(sb, 0);

			/* We can't trust the free space cache either */
			btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);

			ret = read_backup_root(fs_info, i);
			backup_index = ret;
			if (ret < 0)
				return ret;
		}
		generation = btrfs_super_generation(sb);
		level = btrfs_super_root_level(sb);
		tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
						  BTRFS_ROOT_TREE_OBJECTID,
						  generation, level, NULL);
		if (IS_ERR(tree_root->node)) {
			handle_error = true;
			ret = PTR_ERR(tree_root->node);
			tree_root->node = NULL;
			btrfs_warn(fs_info, "couldn't read tree root");
			continue;

		} else if (!extent_buffer_uptodate(tree_root->node)) {
			handle_error = true;
			ret = -EIO;
			btrfs_warn(fs_info, "error while reading tree root");
			continue;
		}

		btrfs_set_root_node(&tree_root->root_item, tree_root->node);
		tree_root->commit_root = btrfs_root_node(tree_root);
		btrfs_set_root_refs(&tree_root->root_item, 1);

		/*
		 * No need to hold btrfs_root::objectid_mutex since the fs
		 * hasn't been fully initialised and we are the only user
		 */
		ret = btrfs_init_root_free_objectid(tree_root);
		if (ret < 0) {
			handle_error = true;
			continue;
		}

		ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);

		ret = btrfs_read_roots(fs_info);
		if (ret < 0) {
			handle_error = true;
			continue;
		}

		/* All successful */
		fs_info->generation = generation;
		fs_info->last_trans_committed = generation;

		/* Always begin writing backup roots after the one being used */
		if (backup_index < 0) {
			fs_info->backup_root_index = 0;
		} else {
			fs_info->backup_root_index = backup_index + 1;
			fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
		}
		break;
	}

	return ret;
}
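
/*
 * Illustrative note (user-facing behaviour, not code in this file): the
 * backup fallback in the loop above is only taken when the filesystem is
 * mounted with the rescue option, e.g.:
 *
 *	mount -o usebackuproot /dev/sdb /mnt
 */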

void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
	INIT_LIST_HEAD(&fs_info->trans_list);
	INIT_LIST_HEAD(&fs_info->dead_roots);
	INIT_LIST_HEAD(&fs_info->delayed_iputs);
	INIT_LIST_HEAD(&fs_info->delalloc_roots);
	INIT_LIST_HEAD(&fs_info->caching_block_groups);
	spin_lock_init(&fs_info->delalloc_root_lock);
	spin_lock_init(&fs_info->trans_lock);
	spin_lock_init(&fs_info->fs_roots_radix_lock);
	spin_lock_init(&fs_info->delayed_iput_lock);
	spin_lock_init(&fs_info->defrag_inodes_lock);
	spin_lock_init(&fs_info->super_lock);
	spin_lock_init(&fs_info->buffer_lock);
	spin_lock_init(&fs_info->unused_bgs_lock);
	rwlock_init(&fs_info->tree_mod_log_lock);
	mutex_init(&fs_info->unused_bg_unpin_mutex);
	mutex_init(&fs_info->delete_unused_bgs_mutex);
	mutex_init(&fs_info->reloc_mutex);
	mutex_init(&fs_info->delalloc_root_mutex);
	mutex_init(&fs_info->zoned_meta_io_lock);
	seqlock_init(&fs_info->profiles_lock);

	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
	INIT_LIST_HEAD(&fs_info->space_info);
	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
	INIT_LIST_HEAD(&fs_info->unused_bgs);
#ifdef CONFIG_BTRFS_DEBUG
	INIT_LIST_HEAD(&fs_info->allocated_roots);
	INIT_LIST_HEAD(&fs_info->allocated_ebs);
	spin_lock_init(&fs_info->eb_leak_lock);
#endif
	extent_map_tree_init(&fs_info->mapping_tree);
	btrfs_init_block_rsv(&fs_info->global_block_rsv,
			     BTRFS_BLOCK_RSV_GLOBAL);
	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
			     BTRFS_BLOCK_RSV_DELOPS);
	btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
			     BTRFS_BLOCK_RSV_DELREFS);

	atomic_set(&fs_info->async_delalloc_pages, 0);
	atomic_set(&fs_info->defrag_running, 0);
	atomic_set(&fs_info->reada_works_cnt, 0);
	atomic_set(&fs_info->nr_delayed_iputs, 0);
	atomic64_set(&fs_info->tree_mod_seq, 0);
	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
	fs_info->metadata_ratio = 0;
	fs_info->defrag_inodes = RB_ROOT;
	atomic64_set(&fs_info->free_chunk_space, 0);
	fs_info->tree_mod_log = RB_ROOT;
	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
	fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
	/* readahead state */
	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	spin_lock_init(&fs_info->reada_lock);
	btrfs_init_ref_verify(fs_info);

	fs_info->thread_pool_size = min_t(unsigned long,
					  num_online_cpus() + 2, 8);

	INIT_LIST_HEAD(&fs_info->ordered_roots);
	spin_lock_init(&fs_info->ordered_root_lock);

	btrfs_init_scrub(fs_info);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	fs_info->check_integrity_print_mask = 0;
#endif
	btrfs_init_balance(fs_info);
	btrfs_init_async_reclaim_work(fs_info);

	spin_lock_init(&fs_info->block_group_cache_lock);
	fs_info->block_group_cache_tree = RB_ROOT;
	fs_info->first_logical_byte = (u64)-1;

	extent_io_tree_init(fs_info, &fs_info->excluded_extents,
			    IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
	set_bit(BTRFS_FS_BARRIER, &fs_info->flags);

	mutex_init(&fs_info->ordered_operations_mutex);
	mutex_init(&fs_info->tree_log_mutex);
	mutex_init(&fs_info->chunk_mutex);
	mutex_init(&fs_info->transaction_kthread_mutex);
	mutex_init(&fs_info->cleaner_mutex);
	mutex_init(&fs_info->ro_block_group_mutex);
	init_rwsem(&fs_info->commit_root_sem);
	init_rwsem(&fs_info->cleanup_work_sem);
	init_rwsem(&fs_info->subvol_sem);
	sema_init(&fs_info->uuid_tree_rescan_sem, 1);

	btrfs_init_dev_replace_locks(fs_info);
	btrfs_init_qgroup(fs_info);
	btrfs_discard_init(fs_info);

	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);

	init_waitqueue_head(&fs_info->transaction_throttle);
	init_waitqueue_head(&fs_info->transaction_wait);
	init_waitqueue_head(&fs_info->transaction_blocked_wait);
	init_waitqueue_head(&fs_info->async_submit_wait);
	init_waitqueue_head(&fs_info->delayed_iputs_wait);

	/* Usable values until the real ones are cached from the superblock */
	fs_info->nodesize = 4096;
	fs_info->sectorsize = 4096;
	fs_info->sectorsize_bits = ilog2(4096);
	fs_info->stripesize = 4096;

	spin_lock_init(&fs_info->swapfile_pins_lock);
	fs_info->swapfile_pins = RB_ROOT;

	fs_info->send_in_progress = 0;
}

static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
{
	int ret;

	fs_info->sb = sb;
	sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
	sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);

	ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
	if (ret)
		return ret;

	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
	if (ret)
		return ret;

	fs_info->dirty_metadata_batch = PAGE_SIZE *
					(1 + ilog2(nr_cpu_ids));

	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
	if (ret)
		return ret;

	ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
			GFP_KERNEL);
	if (ret)
		return ret;

	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
					GFP_KERNEL);
	if (!fs_info->delayed_root)
		return -ENOMEM;
	btrfs_init_delayed_root(fs_info->delayed_root);

	if (sb_rdonly(sb))
		set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);

	return btrfs_alloc_stripe_hash_table(fs_info);
}
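
/*
 * Worked example (illustrative): with 4K pages and nr_cpu_ids == 8,
 * dirty_metadata_batch above is 4096 * (1 + ilog2(8)) == 16K, i.e. roughly
 * the step in which per-cpu deltas are folded into the global
 * dirty_metadata_bytes counter.
 */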

static int btrfs_uuid_rescan_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
	int ret;

	/*
	 * 1st step is to iterate through the existing UUID tree and
	 * to delete all entries that contain outdated data.
	 * 2nd step is to add all missing entries to the UUID tree.
	 */
	ret = btrfs_uuid_tree_iterate(fs_info);
	if (ret < 0) {
		if (ret != -EINTR)
			btrfs_warn(fs_info, "iterating uuid_tree failed %d",
				   ret);
		up(&fs_info->uuid_tree_rescan_sem);
		return ret;
	}
	return btrfs_uuid_scan_kthread(data);
}

static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
{
	struct task_struct *task;

	down(&fs_info->uuid_tree_rescan_sem);
	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
	if (IS_ERR(task)) {
		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
		btrfs_warn(fs_info, "failed to start uuid_rescan task");
		up(&fs_info->uuid_tree_rescan_sem);
		return PTR_ERR(task);
	}

	return 0;
}

/*
 * Some options only have meaning at mount time and shouldn't persist across
 * remounts, or be displayed. Clear these at the end of mount and remount
 * code paths.
 */
void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
{
	btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
	btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
}
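
/*
 * Illustrative note: after 'mount -o clear_cache /dev/sdb /mnt' the cache
 * is rebuilt once and the option is dropped here, so it does not persist
 * into later remounts as if it were a sticky mount option.
 */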

/*
 * Mounting logic specific to read-write file systems. Shared by open_ctree
 * and btrfs_remount when remounting from read-only to read-write.
 */
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
{
	int ret;
	const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
	bool clear_free_space_tree = false;

	if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
	    btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
		clear_free_space_tree = true;
	} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
		   !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
		btrfs_warn(fs_info, "free space tree is invalid");
		clear_free_space_tree = true;
	}

	if (clear_free_space_tree) {
		btrfs_info(fs_info, "clearing free space tree");
		ret = btrfs_clear_free_space_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				   "failed to clear free space tree: %d", ret);
			goto out;
		}
	}

	ret = btrfs_cleanup_fs_roots(fs_info);
	if (ret)
		goto out;

	down_read(&fs_info->cleanup_work_sem);
	if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
	    (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
		up_read(&fs_info->cleanup_work_sem);
		goto out;
	}
	up_read(&fs_info->cleanup_work_sem);

	mutex_lock(&fs_info->cleaner_mutex);
	ret = btrfs_recover_relocation(fs_info->tree_root);
	mutex_unlock(&fs_info->cleaner_mutex);
	if (ret < 0) {
		btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
		goto out;
	}

	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
	    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
		btrfs_info(fs_info, "creating free space tree");
		ret = btrfs_create_free_space_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				"failed to create free space tree: %d", ret);
			goto out;
		}
	}

	if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
		ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
		if (ret)
			goto out;
	}

	ret = btrfs_resume_balance_async(fs_info);
	if (ret)
		goto out;

	ret = btrfs_resume_dev_replace_async(fs_info);
	if (ret) {
		btrfs_warn(fs_info, "failed to resume dev_replace");
		goto out;
	}

	btrfs_qgroup_rescan_resume(fs_info);

	if (!fs_info->uuid_root) {
		btrfs_info(fs_info, "creating UUID tree");
		ret = btrfs_create_uuid_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				   "failed to create the UUID tree %d", ret);
			goto out;
		}
	}

	ret = btrfs_find_orphan_roots(fs_info);
out:
	return ret;
}

int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
		      char *options)
{
	u32 sectorsize;
	u32 nodesize;
	u32 stripesize;
	u64 generation;
	u64 features;
	u16 csum_type;
	struct btrfs_super_block *disk_super;
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
	struct btrfs_root *tree_root;
	struct btrfs_root *chunk_root;
	int ret;
	int err = -EINVAL;
	int level;

	ret = init_mount_fs_info(fs_info, sb);
	if (ret) {
		err = ret;
		goto fail;
	}

	/* These need to be init'ed before we start creating inodes and such. */
	tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
				     GFP_KERNEL);
	fs_info->tree_root = tree_root;
	chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
				      GFP_KERNEL);
	fs_info->chunk_root = chunk_root;
	if (!tree_root || !chunk_root) {
		err = -ENOMEM;
		goto fail;
	}

	fs_info->btree_inode = new_inode(sb);
	if (!fs_info->btree_inode) {
		err = -ENOMEM;
		goto fail;
	}
	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
	btrfs_init_btree_inode(fs_info);
	invalidate_bdev(fs_devices->latest_bdev);

	/*
	 * Read super block and check the signature bytes only
	 */
	disk_super = btrfs_read_dev_super(fs_devices->latest_bdev);
	if (IS_ERR(disk_super)) {
		err = PTR_ERR(disk_super);
		goto fail_alloc;
	}

	/*
	 * Verify the checksum type first: if it or the checksum value is
	 * corrupted, we'll find out here.
	 */
	csum_type = btrfs_super_csum_type(disk_super);
	if (!btrfs_supported_super_csum(csum_type)) {
		btrfs_err(fs_info, "unsupported checksum algorithm: %u",
			  csum_type);
		err = -EINVAL;
		btrfs_release_disk_super(disk_super);
		goto fail_alloc;
	}

	ret = btrfs_init_csum_hash(fs_info, csum_type);
	if (ret) {
		err = ret;
		btrfs_release_disk_super(disk_super);
		goto fail_alloc;
	}

	/*
	 * We want to check superblock checksum, the type is stored inside.
	 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
	 */
	if (btrfs_check_super_csum(fs_info, (u8 *)disk_super)) {
		btrfs_err(fs_info, "superblock checksum mismatch");
		err = -EINVAL;
		btrfs_release_disk_super(disk_super);
		goto fail_alloc;
	}

	/*
	 * super_copy is zeroed at allocation time and we never touch the
	 * following bytes up to INFO_SIZE, the checksum is calculated from
	 * the whole block of INFO_SIZE
	 */
	memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
	btrfs_release_disk_super(disk_super);

	disk_super = fs_info->super_copy;

	ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
		       BTRFS_FSID_SIZE));

	if (btrfs_fs_incompat(fs_info, METADATA_UUID)) {
		ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid,
				fs_info->super_copy->metadata_uuid,
				BTRFS_FSID_SIZE));
	}

	features = btrfs_super_flags(disk_super);
	if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
		features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
		btrfs_set_super_flags(disk_super, features);
		btrfs_info(fs_info,
			"found metadata UUID change in progress flag, clearing");
	}

	memcpy(fs_info->super_for_commit, fs_info->super_copy,
	       sizeof(*fs_info->super_for_commit));

	ret = btrfs_validate_mount_super(fs_info);
	if (ret) {
		btrfs_err(fs_info, "superblock contains fatal errors");
		err = -EINVAL;
		goto fail_alloc;
	}

	if (!btrfs_super_root(disk_super))
		goto fail_alloc;

	/* check FS state, whether FS is broken. */
	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
		set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);

	/*
	 * In the long term, we'll store the compression type in the super
	 * block, and it'll be used for per file compression control.
	 */
	fs_info->compress_type = BTRFS_COMPRESS_ZLIB;

	ret = btrfs_parse_options(fs_info, options, sb->s_flags);
	if (ret) {
		err = ret;
		goto fail_alloc;
	}

	features = btrfs_super_incompat_flags(disk_super) &
		~BTRFS_FEATURE_INCOMPAT_SUPP;
	if (features) {
		btrfs_err(fs_info,
		    "cannot mount because of unsupported optional features (%llx)",
		    features);
		err = -EINVAL;
		goto fail_alloc;
	}

	features = btrfs_super_incompat_flags(disk_super);
	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
	if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
	else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;

	if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
		btrfs_info(fs_info, "has skinny extents");

	/*
	 * flag our filesystem as having big metadata blocks if
	 * they are bigger than the page size
	 */
	if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
		if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
			btrfs_info(fs_info,
				"flagging fs with big metadata feature");
		features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
	}

	nodesize = btrfs_super_nodesize(disk_super);
	sectorsize = btrfs_super_sectorsize(disk_super);
	stripesize = sectorsize;
	fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));

	/* Cache block sizes */
	fs_info->nodesize = nodesize;
	fs_info->sectorsize = sectorsize;
	fs_info->sectorsize_bits = ilog2(sectorsize);
	fs_info->csum_size = btrfs_super_csum_size(disk_super);
	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
	fs_info->stripesize = stripesize;

	/*
	 * mixed block groups end up with duplicate but slightly offset
	 * extent buffers for the same range.  It leads to corruptions
	 */
	if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
	    (sectorsize != nodesize)) {
		btrfs_err(fs_info,
"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
			nodesize, sectorsize);
		goto fail_alloc;
	}

	/*
	 * Needn't use the lock because there is no other task which will
	 * update the flag.
	 */
	btrfs_set_super_incompat_flags(disk_super, features);

	features = btrfs_super_compat_ro_flags(disk_super) &
		~BTRFS_FEATURE_COMPAT_RO_SUPP;
	if (!sb_rdonly(sb) && features) {
		btrfs_err(fs_info,
	"cannot mount read-write because of unsupported optional features (%llx)",
		       features);
		err = -EINVAL;
		goto fail_alloc;
	}

	/* For 4K sector size support, it's only read-only */
	if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) {
		if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) {
			btrfs_err(fs_info,
	"subpage sectorsize %u only supported read-only for page size %lu",
				sectorsize, PAGE_SIZE);
			err = -EINVAL;
			goto fail_alloc;
		}
	}

	ret = btrfs_init_workqueues(fs_info, fs_devices);
	if (ret) {
		err = ret;
		goto fail_sb_buffer;
	}

	sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
	sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);

	sb->s_blocksize = sectorsize;
	sb->s_blocksize_bits = blksize_bits(sectorsize);
	memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);

	mutex_lock(&fs_info->chunk_mutex);
	ret = btrfs_read_sys_array(fs_info);
	mutex_unlock(&fs_info->chunk_mutex);
	if (ret) {
		btrfs_err(fs_info, "failed to read the system array: %d", ret);
		goto fail_sb_buffer;
	}

	generation = btrfs_super_chunk_root_generation(disk_super);
	level = btrfs_super_chunk_root_level(disk_super);

	chunk_root->node = read_tree_block(fs_info,
					   btrfs_super_chunk_root(disk_super),
					   BTRFS_CHUNK_TREE_OBJECTID,
					   generation, level, NULL);
	if (IS_ERR(chunk_root->node) ||
	    !extent_buffer_uptodate(chunk_root->node)) {
		btrfs_err(fs_info, "failed to read chunk root");
		if (!IS_ERR(chunk_root->node))
			free_extent_buffer(chunk_root->node);
		chunk_root->node = NULL;
		goto fail_tree_roots;
	}
	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
	chunk_root->commit_root = btrfs_root_node(chunk_root);

	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
			   offsetof(struct btrfs_header, chunk_tree_uuid),
			   BTRFS_UUID_SIZE);

	ret = btrfs_read_chunk_tree(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
		goto fail_tree_roots;
	}

	/*
	 * At this point we know all the devices that make this filesystem,
	 * including the seed devices but we don't know yet if the replace
	 * target is required. So free devices that are not part of this
	 * filesystem but skip the replace target device which is checked
	 * below in btrfs_init_dev_replace().
	 */
	btrfs_free_extra_devids(fs_devices);
	if (!fs_devices->latest_bdev) {
		btrfs_err(fs_info, "failed to read devices");
		goto fail_tree_roots;
	}

3339
	ret = init_tree_roots(fs_info);
3340
	if (ret)
3341
		goto fail_tree_roots;
3342

	/*
	 * Get zone type information of zoned block devices. This will also
	 * handle emulation of a zoned filesystem if a regular device has the
	 * zoned incompat feature flag set.
	 */
	ret = btrfs_get_dev_zone_info_all_devices(fs_info);
	if (ret) {
		btrfs_err(fs_info,
			  "zoned: failed to read device zone info: %d",
			  ret);
		goto fail_block_groups;
	}

	/*
	 * If we have a uuid root and we're not being told to rescan we need to
	 * check the generation here so we can set the
	 * BTRFS_FS_UPDATE_UUID_TREE_GEN bit.  Otherwise we could commit the
	 * transaction during a balance or the log replay without updating the
	 * uuid generation, and then if we crash we would rescan the uuid tree,
	 * even though it was perfectly fine.
	 */
	if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
	    fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);

	ret = btrfs_verify_dev_extents(fs_info);
	if (ret) {
		btrfs_err(fs_info,
			  "failed to verify dev extents against chunks: %d",
			  ret);
		goto fail_block_groups;
	}
	ret = btrfs_recover_balance(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to recover balance: %d", ret);
		goto fail_block_groups;
	}

	ret = btrfs_init_dev_stats(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
		goto fail_block_groups;
	}

	ret = btrfs_init_dev_replace(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
		goto fail_block_groups;
	}

	ret = btrfs_check_zoned_mode(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to initialize zoned mode: %d",
			  ret);
		goto fail_block_groups;
	}

	ret = btrfs_sysfs_add_fsid(fs_devices);
	if (ret) {
		btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
				ret);
		goto fail_block_groups;
	}

	ret = btrfs_sysfs_add_mounted(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
		goto fail_fsdev_sysfs;
	}

	ret = btrfs_init_space_info(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to initialize space info: %d", ret);
		goto fail_sysfs;
	}

	ret = btrfs_read_block_groups(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to read block groups: %d", ret);
		goto fail_sysfs;
	}

	if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
		btrfs_warn(fs_info,
		"writable mount is not allowed due to too many missing devices");
		goto fail_sysfs;
	}

	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
					       "btrfs-cleaner");
	if (IS_ERR(fs_info->cleaner_kthread))
		goto fail_sysfs;

	fs_info->transaction_kthread = kthread_run(transaction_kthread,
						   tree_root,
						   "btrfs-transaction");
	if (IS_ERR(fs_info->transaction_kthread))
		goto fail_cleaner;

	if (!btrfs_test_opt(fs_info, NOSSD) &&
	    !fs_info->fs_devices->rotating) {
		btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
	}

	/*
	 * Mount does not set all options immediately, we can do it now and do
	 * not have to wait for transaction commit
	 */
	btrfs_apply_pending_changes(fs_info);

#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
		ret = btrfsic_mount(fs_info, fs_devices,
				    btrfs_test_opt(fs_info,
					CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
				    1 : 0,
				    fs_info->check_integrity_print_mask);
		if (ret)
			btrfs_warn(fs_info,
				"failed to initialize integrity check module: %d",
				ret);
	}
#endif
	ret = btrfs_read_qgroup_config(fs_info);
	if (ret)
		goto fail_trans_kthread;

	if (btrfs_build_ref_tree(fs_info))
		btrfs_err(fs_info, "couldn't build ref tree");

	/* do not make disk changes in broken FS or nologreplay is given */
	if (btrfs_super_log_root(disk_super) != 0 &&
	    !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
		btrfs_info(fs_info, "start tree-log replay");
		ret = btrfs_replay_log(fs_info, fs_devices);
		if (ret) {
			err = ret;
			goto fail_qgroup;
		}
	}

	fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
	if (IS_ERR(fs_info->fs_root)) {
		err = PTR_ERR(fs_info->fs_root);
		btrfs_warn(fs_info, "failed to read fs tree: %d", err);
		fs_info->fs_root = NULL;
		goto fail_qgroup;
	}

	if (sb_rdonly(sb))
		goto clear_oneshot;

	ret = btrfs_start_pre_rw_mount(fs_info);
	if (ret) {
		close_ctree(fs_info);
		return ret;
	}
	btrfs_discard_resume(fs_info);

	if (fs_info->uuid_root &&
	    (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
	     fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
		btrfs_info(fs_info, "checking UUID tree");
		ret = btrfs_check_uuid_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				"failed to check the UUID tree: %d", ret);
			close_ctree(fs_info);
			return ret;
		}
	}

	set_bit(BTRFS_FS_OPEN, &fs_info->flags);

clear_oneshot:
	btrfs_clear_oneshot_options(fs_info);
	return 0;

fail_qgroup:
	btrfs_free_qgroup_config(fs_info);
fail_trans_kthread:
	kthread_stop(fs_info->transaction_kthread);
	btrfs_cleanup_transaction(fs_info);
	btrfs_free_fs_roots(fs_info);
fail_cleaner:
	kthread_stop(fs_info->cleaner_kthread);

	/*
	 * make sure we're done with the btree inode before we stop our
	 * kthreads
	 */
	filemap_write_and_wait(fs_info->btree_inode->i_mapping);

fail_sysfs:
	btrfs_sysfs_remove_mounted(fs_info);

fail_fsdev_sysfs:
	btrfs_sysfs_remove_fsid(fs_info->fs_devices);

fail_block_groups:
	btrfs_put_block_group_cache(fs_info);

fail_tree_roots:
	if (fs_info->data_reloc_root)
		btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
	free_root_pointers(fs_info, true);
	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);

fail_sb_buffer:
	btrfs_stop_all_workers(fs_info);
	btrfs_free_block_groups(fs_info);
fail_alloc:
	btrfs_mapping_tree_free(&fs_info->mapping_tree);

	iput(fs_info->btree_inode);
fail:
	btrfs_close_devices(fs_info->fs_devices);
	return err;
}
ALLOW_ERROR_INJECTION(open_ctree, ERRNO);

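/*
 * Endio handler for the superblock writes submitted by write_dev_supers().
 * Records write errors in the device statistics, then drops the submission
 * page reference and unlocks each page so that wait_dev_supers() can observe
 * completion via the page lock and error bits.
 */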
static void btrfs_end_super_write(struct bio *bio)
{
	struct btrfs_device *device = bio->bi_private;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;
	struct page *page;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		page = bvec->bv_page;

		if (bio->bi_status) {
			btrfs_warn_rl_in_rcu(device->fs_info,
				"lost page write due to IO error on %s (%d)",
				rcu_str_deref(device->name),
				blk_status_to_errno(bio->bi_status));
			ClearPageUptodate(page);
			SetPageError(page);
			btrfs_dev_stat_inc_and_print(device,
						     BTRFS_DEV_STAT_WRITE_ERRS);
		} else {
			SetPageUptodate(page);
		}

		put_page(page);
		unlock_page(page);
	}

	bio_put(bio);
}

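/*
 * Read a single superblock copy from @bdev via the page cache, honouring the
 * zoned log location for @copy_num.  Basic sanity checks (magic and bytenr)
 * are done here; the caller is expected to release the result with
 * btrfs_release_disk_super().
 */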
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
						   int copy_num)
{
	struct btrfs_super_block *super;
	struct page *page;
	u64 bytenr, bytenr_orig;
	struct address_space *mapping = bdev->bd_inode->i_mapping;
	int ret;

	bytenr_orig = btrfs_sb_offset(copy_num);
	ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
	if (ret == -ENOENT)
		return ERR_PTR(-EINVAL);
	else if (ret)
		return ERR_PTR(ret);

	if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
		return ERR_PTR(-EINVAL);

	page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
	if (IS_ERR(page))
		return ERR_CAST(page);

	super = page_address(page);
	if (btrfs_super_magic(super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(super);
		return ERR_PTR(-ENODATA);
	}

	if (btrfs_super_bytenr(super) != bytenr_orig) {
		btrfs_release_disk_super(super);
		return ERR_PTR(-EINVAL);
	}

	return super;
}


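/*
 * Return the superblock with the highest generation among the scanned
 * copies.  As the comment below explains, only the first copy is examined by
 * default so that leftovers of a different filesystem cannot be mistaken for
 * a valid btrfs.
 */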
struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
{
	struct btrfs_super_block *super, *latest = NULL;
	int i;
	u64 transid = 0;

	/* we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	for (i = 0; i < 1; i++) {
		super = btrfs_read_dev_one_super(bdev, i);
		if (IS_ERR(super))
			continue;

		if (!latest || btrfs_super_generation(super) > transid) {
			if (latest)
				btrfs_release_disk_super(super);

			latest = super;
			transid = btrfs_super_generation(super);
		}
	}

	return super;
}

/*
 * Write superblock @sb to the @device. Do not wait for completion, all the
 * pages we use for writing are locked.
 *
 * Write @max_mirrors copies of the superblock, where 0 means default that fit
 * the expected device size at commit time. Note that max_mirrors must be
 * same for write and wait phases.
 *
 * Return number of errors when page is not found or submission fails.
 */
static int write_dev_supers(struct btrfs_device *device,
			    struct btrfs_super_block *sb, int max_mirrors)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct address_space *mapping = device->bdev->bd_inode->i_mapping;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	int i;
	int errors = 0;
	int ret;
	u64 bytenr, bytenr_orig;

	if (max_mirrors == 0)
		max_mirrors = BTRFS_SUPER_MIRROR_MAX;

	shash->tfm = fs_info->csum_shash;

	for (i = 0; i < max_mirrors; i++) {
		struct page *page;
		struct bio *bio;
		struct btrfs_super_block *disk_super;

		bytenr_orig = btrfs_sb_offset(i);
		ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
		if (ret == -ENOENT) {
			continue;
		} else if (ret < 0) {
			btrfs_err(device->fs_info,
				"couldn't get super block location for mirror %d",
				i);
			errors++;
			continue;
		}
		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
		    device->commit_total_bytes)
			break;

		btrfs_set_super_bytenr(sb, bytenr_orig);

		crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
				    sb->csum);

		page = find_or_create_page(mapping, bytenr >> PAGE_SHIFT,
					   GFP_NOFS);
		if (!page) {
			btrfs_err(device->fs_info,
			    "couldn't get super block page for bytenr %llu",
			    bytenr);
			errors++;
			continue;
		}

		/* Bump the refcount for wait_dev_supers() */
		get_page(page);

		disk_super = page_address(page);
		memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);

		/*
		 * Directly use bios here instead of relying on the page cache
		 * to do I/O, so we don't lose the ability to do integrity
		 * checking.
		 */
		bio = bio_alloc(GFP_NOFS, 1);
		bio_set_dev(bio, device->bdev);
		bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
		bio->bi_private = device;
		bio->bi_end_io = btrfs_end_super_write;
		__bio_add_page(bio, page, BTRFS_SUPER_INFO_SIZE,
			       offset_in_page(bytenr));

		/*
		 * We FUA only the first super block.  The others we allow to
		 * go down lazy and there's a short window where the on-disk
		 * copies might still contain the older version.
		 */
		bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO;
		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
			bio->bi_opf |= REQ_FUA;

		btrfsic_submit_bio(bio);
		btrfs_advance_sb_log(device, i);
	}
	return errors < i ? 0 : -1;
}

/*
 * Wait for write completion of superblocks done by write_dev_supers,
 * @max_mirrors same for write and wait phases.
 *
 * Return number of errors when page is not found or not marked up to
 * date.
 */
static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
{
	int i;
	int errors = 0;
	bool primary_failed = false;
	int ret;
	u64 bytenr;

	if (max_mirrors == 0)
		max_mirrors = BTRFS_SUPER_MIRROR_MAX;

	for (i = 0; i < max_mirrors; i++) {
		struct page *page;

		ret = btrfs_sb_log_location(device, i, READ, &bytenr);
		if (ret == -ENOENT) {
			break;
		} else if (ret < 0) {
			errors++;
			if (i == 0)
				primary_failed = true;
			continue;
		}
		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
		    device->commit_total_bytes)
			break;

		page = find_get_page(device->bdev->bd_inode->i_mapping,
				     bytenr >> PAGE_SHIFT);
		if (!page) {
			errors++;
			if (i == 0)
				primary_failed = true;
			continue;
		}
		/* Page is submitted locked and unlocked once the IO completes */
		wait_on_page_locked(page);
		if (PageError(page)) {
			errors++;
			if (i == 0)
				primary_failed = true;
		}

		/* Drop our reference */
		put_page(page);

		/* Drop the reference from the writing run */
		put_page(page);
	}

	/* log error, force error return */
	if (primary_failed) {
		btrfs_err(device->fs_info, "error writing primary super block to device %llu",
			  device->devid);
		return -1;
	}

	return errors < i ? 0 : -1;
}

/*
 * endio for the write_dev_flush, this will wake anyone waiting
 * for the barrier when it is done
 */
static void btrfs_end_empty_barrier(struct bio *bio)
{
	complete(bio->bi_private);
}

/*
 * Submit a flush request to the device if it supports it. Error handling is
 * done in the waiting counterpart.
 */
static void write_dev_flush(struct btrfs_device *device)
{
	struct request_queue *q = bdev_get_queue(device->bdev);
	struct bio *bio = device->flush_bio;

	if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
		return;

	bio_reset(bio);
	bio->bi_end_io = btrfs_end_empty_barrier;
	bio_set_dev(bio, device->bdev);
	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
	init_completion(&device->flush_wait);
	bio->bi_private = &device->flush_wait;

	btrfsic_submit_bio(bio);
	set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
}

/*
 * If the flush bio has been submitted by write_dev_flush, wait for it.
 */
static blk_status_t wait_dev_flush(struct btrfs_device *device)
{
	struct bio *bio = device->flush_bio;

	if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
		return BLK_STS_OK;

	clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
	wait_for_completion_io(&device->flush_wait);

	return bio->bi_status;
}

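/*
 * After flush errors, decide whether the filesystem can continue: barrier
 * failures are tolerated as long as the remaining devices still satisfy the
 * RAID profile constraints checked by btrfs_check_rw_degradable().
 */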
static int check_barrier_error(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_check_rw_degradable(fs_info, NULL))
		return -EIO;
	return 0;
}

/*
 * send an empty flush down to each device in parallel,
 * then wait for them
 */
static int barrier_all_devices(struct btrfs_fs_info *info)
{
	struct list_head *head;
	struct btrfs_device *dev;
	int errors_wait = 0;
	blk_status_t ret;

	lockdep_assert_held(&info->fs_devices->device_list_mutex);
	/* send down all the barriers */
	head = &info->fs_devices->devices;
	list_for_each_entry(dev, head, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
			continue;
		if (!dev->bdev)
			continue;
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		write_dev_flush(dev);
		dev->last_flush_error = BLK_STS_OK;
	}

	/* wait for all the barriers */
	list_for_each_entry(dev, head, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
			continue;
		if (!dev->bdev) {
			errors_wait++;
			continue;
		}
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		ret = wait_dev_flush(dev);
		if (ret) {
			dev->last_flush_error = ret;
			btrfs_dev_stat_inc_and_print(dev,
					BTRFS_DEV_STAT_FLUSH_ERRS);
			errors_wait++;
		}
	}

	if (errors_wait) {
		/*
		 * At some point we need the status of all disks
		 * to arrive at the volume status. So error checking
		 * is being pushed to a separate loop.
		 */
		return check_barrier_error(info);
	}
	return 0;
}

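/*
 * Compute how many device failures the given block group @flags can
 * tolerate, taking the minimum across all RAID profiles present, e.g. 0 for
 * SINGLE/RAID0 and 1 for RAID1.
 */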
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
{
	int raid_type;
	int min_tolerated = INT_MAX;

	if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
	    (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
		min_tolerated = min_t(int, min_tolerated,
				    btrfs_raid_array[BTRFS_RAID_SINGLE].
				    tolerated_failures);

	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
		if (raid_type == BTRFS_RAID_SINGLE)
			continue;
		if (!(flags & btrfs_raid_array[raid_type].bg_flag))
			continue;
		min_tolerated = min_t(int, min_tolerated,
				    btrfs_raid_array[raid_type].
				    tolerated_failures);
	}

	if (min_tolerated == INT_MAX) {
		pr_warn("BTRFS: unknown raid flag: %llu", flags);
		min_tolerated = 0;
	}

	return min_tolerated;
}

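/*
 * Write the superblock to all writeable devices in two phases: first submit
 * device barriers and the super block copies, then wait for all of them,
 * failing with -EIO once more writes fail than the tolerated maximum
 * (number of devices - 1).
 */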
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
{
	struct list_head *head;
	struct btrfs_device *dev;
	struct btrfs_super_block *sb;
	struct btrfs_dev_item *dev_item;
	int ret;
	int do_barriers;
	int max_errors;
	int total_errors = 0;
	u64 flags;

	do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);

	/*
	 * max_mirrors == 0 indicates we're from commit_transaction,
	 * not from fsync where the tree roots in fs_info have not
	 * been consistent on disk.
	 */
	if (max_mirrors == 0)
		backup_super_roots(fs_info);

	sb = fs_info->super_for_commit;
	dev_item = &sb->dev_item;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	head = &fs_info->fs_devices->devices;
	max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;

	if (do_barriers) {
		ret = barrier_all_devices(fs_info);
		if (ret) {
			mutex_unlock(
				&fs_info->fs_devices->device_list_mutex);
			btrfs_handle_fs_error(fs_info, ret,
					      "errors while submitting device barriers.");
			return ret;
		}
	}

	list_for_each_entry(dev, head, dev_list) {
		if (!dev->bdev) {
			total_errors++;
			continue;
		}
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		btrfs_set_stack_device_generation(dev_item, 0);
		btrfs_set_stack_device_type(dev_item, dev->type);
		btrfs_set_stack_device_id(dev_item, dev->devid);
		btrfs_set_stack_device_total_bytes(dev_item,
						   dev->commit_total_bytes);
		btrfs_set_stack_device_bytes_used(dev_item,
						  dev->commit_bytes_used);
		btrfs_set_stack_device_io_align(dev_item, dev->io_align);
		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
		memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
		       BTRFS_FSID_SIZE);

		flags = btrfs_super_flags(sb);
		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);

		ret = btrfs_validate_write_super(fs_info, sb);
		if (ret < 0) {
			mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			btrfs_handle_fs_error(fs_info, -EUCLEAN,
				"unexpected superblock corruption detected");
			return -EUCLEAN;
		}

		ret = write_dev_supers(dev, sb, max_mirrors);
		if (ret)
			total_errors++;
	}
	if (total_errors > max_errors) {
		btrfs_err(fs_info, "%d errors while writing supers",
			  total_errors);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);

		/* FUA is masked off if unsupported and can't be the reason */
		btrfs_handle_fs_error(fs_info, -EIO,
				      "%d errors while writing supers",
				      total_errors);
		return -EIO;
	}

	total_errors = 0;
	list_for_each_entry(dev, head, dev_list) {
		if (!dev->bdev)
			continue;
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		ret = wait_dev_supers(dev, max_mirrors);
		if (ret)
			total_errors++;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	if (total_errors > max_errors) {
		btrfs_handle_fs_error(fs_info, -EIO,
				      "%d errors while writing supers",
				      total_errors);
		return -EIO;
	}
	return 0;
}

/* Drop a fs root from the radix tree and free it. */
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
				  struct btrfs_root *root)
{
	bool drop_ref = false;

	spin_lock(&fs_info->fs_roots_radix_lock);
	radix_tree_delete(&fs_info->fs_roots_radix,
			  (unsigned long)root->root_key.objectid);
	if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
		drop_ref = true;
	spin_unlock(&fs_info->fs_roots_radix_lock);

	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
		ASSERT(root->log_root == NULL);
		if (root->reloc_root) {
			btrfs_put_root(root->reloc_root);
			root->reloc_root = NULL;
		}
	}

	if (drop_ref)
		btrfs_put_root(root);
}

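/*
 * Run orphan cleanup on every fs root still present in the radix tree.
 * Roots are looked up in batches of 8 and grabbed under the lock so they
 * cannot disappear while btrfs_orphan_cleanup() runs.
 */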
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
	u64 root_objectid = 0;
	struct btrfs_root *gang[8];
	int i = 0;
	int err = 0;
	unsigned int ret = 0;

	while (1) {
		spin_lock(&fs_info->fs_roots_radix_lock);
		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
					     (void **)gang, root_objectid,
					     ARRAY_SIZE(gang));
		if (!ret) {
			spin_unlock(&fs_info->fs_roots_radix_lock);
			break;
		}
		root_objectid = gang[ret - 1]->root_key.objectid + 1;

		for (i = 0; i < ret; i++) {
			/* Avoid to grab roots in dead_roots */
			if (btrfs_root_refs(&gang[i]->root_item) == 0) {
				gang[i] = NULL;
				continue;
			}
			/* grab all the search result for later use */
			gang[i] = btrfs_grab_root(gang[i]);
		}
		spin_unlock(&fs_info->fs_roots_radix_lock);

		for (i = 0; i < ret; i++) {
			if (!gang[i])
				continue;
			root_objectid = gang[i]->root_key.objectid;
			err = btrfs_orphan_cleanup(gang[i]);
			if (err)
				break;
			btrfs_put_root(gang[i]);
		}
		root_objectid++;
	}

	/* release the uncleaned roots due to error */
	for (; i < ret; i++) {
		if (gang[i])
			btrfs_put_root(gang[i]);
	}
	return err;
}

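/*
 * Flush pending delayed iputs and cleanup work, then commit the current
 * transaction so the filesystem is consistent on disk before unmount
 * proceeds.
 */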
int btrfs_commit_super(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;

	mutex_lock(&fs_info->cleaner_mutex);
	btrfs_run_delayed_iputs(fs_info);
	mutex_unlock(&fs_info->cleaner_mutex);
	wake_up_process(fs_info->cleaner_kthread);

	/* wait until ongoing cleanup work done */
	down_write(&fs_info->cleanup_work_sem);
	up_write(&fs_info->cleanup_work_sem);

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);
	return btrfs_commit_transaction(trans);
}

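/*
 * Final unmount path: stop all background work (cleaner, qgroup rescan,
 * UUID scan, balance, scrub, reclaim and discard), commit or error out the
 * last transaction, then tear down the in-memory state and close the
 * devices.
 */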
void __cold close_ctree(struct btrfs_fs_info *fs_info)
{
	int ret;

	set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
	/*
	 * We don't want the cleaner to start new transactions, add more delayed
	 * iputs, etc. while we're closing. We can't use kthread_stop() yet
	 * because that frees the task_struct, and the transaction kthread might
	 * still try to wake up the cleaner.
	 */
	kthread_park(fs_info->cleaner_kthread);

	/* wait for the qgroup rescan worker to stop */
	btrfs_qgroup_wait_for_completion(fs_info, false);

	/* wait for the uuid_scan task to finish */
	down(&fs_info->uuid_tree_rescan_sem);
	/* avoid complains from lockdep et al., set sem back to initial state */
	up(&fs_info->uuid_tree_rescan_sem);

	/* pause restriper - we want to resume on mount */
	btrfs_pause_balance(fs_info);

	btrfs_dev_replace_suspend_for_unmount(fs_info);

	btrfs_scrub_cancel(fs_info);

	/* wait for any defraggers to finish */
	wait_event(fs_info->transaction_wait,
		   (atomic_read(&fs_info->defrag_running) == 0));

	/* clear out the rbtree of defraggable inodes */
	btrfs_cleanup_defrag_inodes(fs_info);

	cancel_work_sync(&fs_info->async_reclaim_work);
	cancel_work_sync(&fs_info->async_data_reclaim_work);
	cancel_work_sync(&fs_info->preempt_reclaim_work);

	/* Cancel or finish ongoing discard work */
	btrfs_discard_cleanup(fs_info);

	if (!sb_rdonly(fs_info->sb)) {
		/*
		 * The cleaner kthread is stopped, so do one final pass over
		 * unused block groups.
		 */
		btrfs_delete_unused_bgs(fs_info);

		/*
		 * There might be existing delayed inode workers still running
		 * and holding an empty delayed inode item. We must wait for
		 * them to complete first because they can create a transaction.
		 * This happens when someone calls btrfs_balance_delayed_items()
		 * and then a transaction commit runs the same delayed nodes
		 * before any delayed worker has done something with the nodes.
		 * We must wait for any worker here and not at transaction
		 * commit time since that could cause a deadlock.
		 * This is a very rare case.
		 */
		btrfs_flush_workqueue(fs_info->delayed_workers);

		ret = btrfs_commit_super(fs_info);
		if (ret)
			btrfs_err(fs_info, "commit super ret %d", ret);
	}

	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
	    test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
		btrfs_error_commit_super(fs_info);

	kthread_stop(fs_info->transaction_kthread);
	kthread_stop(fs_info->cleaner_kthread);

	ASSERT(list_empty(&fs_info->delayed_iputs));
	set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);

	if (btrfs_check_quota_leak(fs_info)) {
		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
		btrfs_err(fs_info, "qgroup reserved space leaked");
	}

	btrfs_free_qgroup_config(fs_info);
	ASSERT(list_empty(&fs_info->delalloc_roots));

	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
		btrfs_info(fs_info, "at unmount delalloc count %lld",
		       percpu_counter_sum(&fs_info->delalloc_bytes));
	}

	if (percpu_counter_sum(&fs_info->ordered_bytes))
		btrfs_info(fs_info, "at unmount dio bytes count %lld",
			   percpu_counter_sum(&fs_info->ordered_bytes));

	btrfs_sysfs_remove_mounted(fs_info);
	btrfs_sysfs_remove_fsid(fs_info->fs_devices);

	btrfs_put_block_group_cache(fs_info);

	/*
	 * We must make sure there are no read requests submitted after we
	 * stop all workers.
	 */
	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
	btrfs_stop_all_workers(fs_info);

	/* We shouldn't have any transaction open at this point */
	ASSERT(list_empty(&fs_info->trans_list));

	clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
	free_root_pointers(fs_info, true);
	btrfs_free_fs_roots(fs_info);

	/*
	 * We must free the block groups after dropping the fs_roots as we could
	 * have had an IO error and have left over tree log blocks that aren't
	 * cleaned up until the fs roots are freed.  This makes the block group
	 * accounting appear to be wrong because there's pending reserved bytes,
	 * so make sure we do the block group cleanup afterwards.
	 */
	btrfs_free_block_groups(fs_info);

	iput(fs_info->btree_inode);

#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
		btrfsic_unmount(fs_info->fs_devices);
#endif

	btrfs_mapping_tree_free(&fs_info->mapping_tree);
	btrfs_close_devices(fs_info->fs_devices);
}

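/*
 * Check that an extent buffer is up to date and matches @parent_transid.
 * Returns a positive value if the buffer is usable, 0 if it is stale, or
 * -EAGAIN when @atomic is set and the check would have to block.
 */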
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
			  int atomic)
{
	int ret;
	struct inode *btree_inode = buf->pages[0]->mapping->host;

	ret = extent_buffer_uptodate(buf);
	if (!ret)
		return ret;

	ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
				    parent_transid, atomic);
	if (ret == -EAGAIN)
		return ret;
	return !ret;
}

void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
	struct btrfs_fs_info *fs_info = buf->fs_info;
	u64 transid = btrfs_header_generation(buf);
	int was_dirty;

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
	/*
	 * This is a fast path so only do this check if we have sanity tests
	 * enabled.  Normal people shouldn't be using unmapped buffers as dirty
	 * outside of the sanity tests.
	 */
	if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
		return;
#endif
	btrfs_assert_tree_locked(buf);
	if (transid != fs_info->generation)
		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
			buf->start, transid, fs_info->generation);
	was_dirty = set_extent_buffer_dirty(buf);
	if (!was_dirty)
		percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
					 buf->len,
					 fs_info->dirty_metadata_batch);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	/*
	 * btrfs_mark_buffer_dirty() can be called with the item pointer set
	 * but the item data not yet updated, so only check item pointers
	 * here, not item data.
	 */
	if (btrfs_header_level(buf) == 0 &&
	    btrfs_check_leaf_relaxed(buf)) {
		btrfs_print_leaf(buf);
		ASSERT(0);
	}
#endif
}

static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
					int flush_delayed)
{
	/*
	 * looks as though older kernels can get into trouble with
	 * this code, they end up stuck in balance_dirty_pages forever
	 */
	int ret;

	if (current->flags & PF_MEMALLOC)
		return;

	if (flush_delayed)
		btrfs_balance_delayed_items(fs_info);

	ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
				     BTRFS_DIRTY_METADATA_THRESH,
				     fs_info->dirty_metadata_batch);
	if (ret > 0) {
		balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
	}
}
4384
void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
{
	__btrfs_btree_balance_dirty(fs_info, 1);
}

void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
{
	__btrfs_btree_balance_dirty(fs_info, 0);
}

int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
		      struct btrfs_key *first_key)
{
	return btree_read_extent_buffer_pages(buf, parent_transid,
					      level, first_key);
}

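/*
 * Emergency counterpart of btrfs_commit_super() used when the filesystem is
 * already in an error state: force-clean the running transactions and flush
 * delayed iputs instead of committing.
 */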
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
{
	/* cleanup FS via transaction */
	btrfs_cleanup_transaction(fs_info);

	mutex_lock(&fs_info->cleaner_mutex);
	btrfs_run_delayed_iputs(fs_info);
	mutex_unlock(&fs_info->cleaner_mutex);

	down_write(&fs_info->cleanup_work_sem);
	up_write(&fs_info->cleanup_work_sem);
}

static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *gang[8];
	u64 root_objectid = 0;
	int ret;

	spin_lock(&fs_info->fs_roots_radix_lock);
	while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
					     (void **)gang, root_objectid,
					     ARRAY_SIZE(gang))) != 0) {
		int i;

		for (i = 0; i < ret; i++)
			gang[i] = btrfs_grab_root(gang[i]);
		spin_unlock(&fs_info->fs_roots_radix_lock);

		for (i = 0; i < ret; i++) {
			if (!gang[i])
				continue;
			root_objectid = gang[i]->root_key.objectid;
			btrfs_free_log(NULL, gang[i]);
			btrfs_put_root(gang[i]);
		}
		root_objectid++;
		spin_lock(&fs_info->fs_roots_radix_lock);
	}
	spin_unlock(&fs_info->fs_roots_radix_lock);
	btrfs_free_log_root_tree(NULL, fs_info);
}

static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
	struct btrfs_ordered_extent *ordered;

	spin_lock(&root->ordered_extent_lock);
	/*
	 * This will just short circuit the ordered completion stuff which will
	 * make sure the ordered extent gets properly cleaned up.
	 */
	list_for_each_entry(ordered, &root->ordered_extents,
			    root_extent_list)
		set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
	spin_unlock(&root->ordered_extent_lock);
}

static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;
	struct list_head splice;

	INIT_LIST_HEAD(&splice);

	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
	while (!list_empty(&splice)) {
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);

		spin_unlock(&fs_info->ordered_root_lock);
		btrfs_destroy_ordered_extents(root);

		cond_resched();
		spin_lock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&fs_info->ordered_root_lock);

	/*
	 * We need this here because if we've been flipped read-only we won't
	 * get sync() from the umount, so we need to make sure any ordered
	 * extents whose dirty pages haven't started writeback yet actually
	 * get run and error out properly.
	 */
	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}

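/*
 * Throw away all delayed refs queued on the aborted transaction.  Heads
 * with must_insert_reserved set have their bytes moved from reserved to
 * pinned and immediately unpinned again so the space accounting stays
 * balanced during the forced cleanup.
 */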
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
				      struct btrfs_fs_info *fs_info)
{
	struct rb_node *node;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_node *ref;
	int ret = 0;

	delayed_refs = &trans->delayed_refs;

	spin_lock(&delayed_refs->lock);
	if (atomic_read(&delayed_refs->num_entries) == 0) {
		spin_unlock(&delayed_refs->lock);
		btrfs_debug(fs_info, "delayed_refs has NO entry");
		return ret;
	}

	while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
		struct btrfs_delayed_ref_head *head;
		struct rb_node *n;
		bool pin_bytes = false;

		head = rb_entry(node, struct btrfs_delayed_ref_head,
				href_node);
		if (btrfs_delayed_ref_lock(delayed_refs, head))
			continue;

		spin_lock(&head->lock);
		while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
			ref = rb_entry(n, struct btrfs_delayed_ref_node,
				       ref_node);
			ref->in_tree = 0;
			rb_erase_cached(&ref->ref_node, &head->ref_tree);
			RB_CLEAR_NODE(&ref->ref_node);
			if (!list_empty(&ref->add_list))
				list_del(&ref->add_list);
			atomic_dec(&delayed_refs->num_entries);
			btrfs_put_delayed_ref(ref);
		}
		if (head->must_insert_reserved)
			pin_bytes = true;
		btrfs_free_delayed_extent_op(head->extent_op);
		btrfs_delete_ref_head(delayed_refs, head);
		spin_unlock(&head->lock);
		spin_unlock(&delayed_refs->lock);
		mutex_unlock(&head->mutex);

		if (pin_bytes) {
			struct btrfs_block_group *cache;

			cache = btrfs_lookup_block_group(fs_info, head->bytenr);
			BUG_ON(!cache);

			spin_lock(&cache->space_info->lock);
			spin_lock(&cache->lock);
			cache->pinned += head->num_bytes;
			btrfs_space_info_update_bytes_pinned(fs_info,
				cache->space_info, head->num_bytes);
			cache->reserved -= head->num_bytes;
			cache->space_info->bytes_reserved -= head->num_bytes;
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);
			percpu_counter_add_batch(
				&cache->space_info->total_bytes_pinned,
				head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);

			btrfs_put_block_group(cache);

			btrfs_error_unpin_extent_range(fs_info, head->bytenr,
				head->bytenr + head->num_bytes - 1);
		}
		btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
		btrfs_put_delayed_ref_head(head);
		cond_resched();
		spin_lock(&delayed_refs->lock);
	}
	btrfs_qgroup_destroy_extent_records(trans);

	spin_unlock(&delayed_refs->lock);

	return ret;
}

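/*
 * Drop every inode from @root's delalloc list and invalidate its page
 * cache so no dirty pages survive the forced cleanup.
 */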
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
{
	struct btrfs_inode *btrfs_inode;
	struct list_head splice;

	INIT_LIST_HEAD(&splice);

	spin_lock(&root->delalloc_lock);
	list_splice_init(&root->delalloc_inodes, &splice);

	while (!list_empty(&splice)) {
		struct inode *inode = NULL;

		btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
					       delalloc_inodes);
		__btrfs_del_delalloc_inode(root, btrfs_inode);
		spin_unlock(&root->delalloc_lock);

		/*
		 * Make sure we get a live inode and that it'll not disappear
		 * meanwhile.
		 */
		inode = igrab(&btrfs_inode->vfs_inode);
		if (inode) {
			invalidate_inode_pages2(inode->i_mapping);
			iput(inode);
		}
		spin_lock(&root->delalloc_lock);
	}
	spin_unlock(&root->delalloc_lock);
}

static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;
	struct list_head splice;

	INIT_LIST_HEAD(&splice);

	spin_lock(&fs_info->delalloc_root_lock);
	list_splice_init(&fs_info->delalloc_roots, &splice);
	while (!list_empty(&splice)) {
		root = list_first_entry(&splice, struct btrfs_root,
					 delalloc_root);
		root = btrfs_grab_root(root);
		BUG_ON(!root);
		spin_unlock(&fs_info->delalloc_root_lock);

		btrfs_destroy_delalloc_inodes(root);
		btrfs_put_root(root);

		spin_lock(&fs_info->delalloc_root_lock);
	}
	spin_unlock(&fs_info->delalloc_root_lock);
}

static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
					struct extent_io_tree *dirty_pages,
					int mark)
{
	int ret;
	struct extent_buffer *eb;
	u64 start = 0;
	u64 end;

	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
4640
					    mark, NULL);
L
liubo 已提交
4641 4642 4643
		if (ret)
			break;

4644
		clear_extent_bits(dirty_pages, start, end, mark);
L
liubo 已提交
4645
		while (start <= end) {
4646 4647
			eb = find_extent_buffer(fs_info, start);
			start += fs_info->nodesize;
4648
			if (!eb)
L
liubo 已提交
4649
				continue;
4650
			wait_on_extent_buffer_writeback(eb);
L
liubo 已提交
4651

4652 4653 4654 4655
			if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
					       &eb->bflags))
				clear_extent_buffer_dirty(eb);
			free_extent_buffer_stale(eb);
L
liubo 已提交
4656 4657 4658 4659 4660 4661
		}
	}

	return ret;
}

static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
				       struct extent_io_tree *unpin)
{
	u64 start;
	u64 end;
	int ret;

	while (1) {
		struct extent_state *cached_state = NULL;

		/*
		 * The btrfs_finish_extent_commit() may get the same range as
		 * ours between find_first_extent_bit and clear_extent_dirty.
		 * Hence, hold the unused_bg_unpin_mutex to avoid double
		 * unpinning the same extent range.
		 */
		mutex_lock(&fs_info->unused_bg_unpin_mutex);
		ret = find_first_extent_bit(unpin, 0, &start, &end,
					    EXTENT_DIRTY, &cached_state);
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			break;
		}

		clear_extent_dirty(unpin, start, end, &cached_state);
		free_extent_state(cached_state);
		btrfs_error_unpin_extent_range(fs_info, start, end);
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
		cond_resched();
	}

	return 0;
}

static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
{
	struct inode *inode;

	inode = cache->io_ctl.inode;
	if (inode) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
		cache->io_ctl.inode = NULL;
		iput(inode);
	}
	ASSERT(cache->io_ctl.pages == NULL);
	btrfs_put_block_group(cache);
}

void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
			     struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *cache;

	spin_lock(&cur_trans->dirty_bgs_lock);
	while (!list_empty(&cur_trans->dirty_bgs)) {
		cache = list_first_entry(&cur_trans->dirty_bgs,
					 struct btrfs_block_group,
					 dirty_list);

		if (!list_empty(&cache->io_list)) {
			spin_unlock(&cur_trans->dirty_bgs_lock);
			list_del_init(&cache->io_list);
			btrfs_cleanup_bg_io(cache);
			spin_lock(&cur_trans->dirty_bgs_lock);
		}

		list_del_init(&cache->dirty_list);
		spin_lock(&cache->lock);
		cache->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&cache->lock);

		spin_unlock(&cur_trans->dirty_bgs_lock);
		btrfs_put_block_group(cache);
		btrfs_delayed_refs_rsv_release(fs_info, 1);
		spin_lock(&cur_trans->dirty_bgs_lock);
	}
	spin_unlock(&cur_trans->dirty_bgs_lock);

	/*
	 * Refer to the definition of io_bgs member for details why it's safe
	 * to use it without any locking
	 */
	while (!list_empty(&cur_trans->io_bgs)) {
		cache = list_first_entry(&cur_trans->io_bgs,
					 struct btrfs_block_group,
					 io_list);

		list_del_init(&cache->io_list);
		spin_lock(&cache->lock);
		cache->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&cache->lock);
		btrfs_cleanup_bg_io(cache);
	}
}

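/*
 * Called for an aborted transaction: walk the same state transitions as a
 * normal commit so that any waiters are woken, while throwing away dirty
 * block groups, delayed refs, delayed inodes and pinned extents.
 */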
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
				   struct btrfs_fs_info *fs_info)
{
	struct btrfs_device *dev, *tmp;

	btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
	ASSERT(list_empty(&cur_trans->dirty_bgs));
	ASSERT(list_empty(&cur_trans->io_bgs));

	list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
				 post_commit_list) {
		list_del_init(&dev->post_commit_list);
	}

	btrfs_destroy_delayed_refs(cur_trans, fs_info);

	cur_trans->state = TRANS_STATE_COMMIT_START;
	wake_up(&fs_info->transaction_blocked_wait);

	cur_trans->state = TRANS_STATE_UNBLOCKED;
	wake_up(&fs_info->transaction_wait);

	btrfs_destroy_delayed_inodes(fs_info);

	btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
				     EXTENT_DIRTY);
	btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);

	btrfs_free_redirty_list(cur_trans);

	cur_trans->state = TRANS_STATE_COMPLETED;
	wake_up(&cur_trans->commit_wait);
}

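/*
 * Force every transaction still on fs_info->trans_list through the cleanup
 * path, waiting for commits that are already in progress.  Used by the
 * error-state teardown paths.
 */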
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
{
	struct btrfs_transaction *t;

	mutex_lock(&fs_info->transaction_kthread_mutex);

	spin_lock(&fs_info->trans_lock);
	while (!list_empty(&fs_info->trans_list)) {
		t = list_first_entry(&fs_info->trans_list,
				     struct btrfs_transaction, list);
		if (t->state >= TRANS_STATE_COMMIT_START) {
			refcount_inc(&t->use_count);
			spin_unlock(&fs_info->trans_lock);
			btrfs_wait_for_commit(fs_info, t->transid);
			btrfs_put_transaction(t);
			spin_lock(&fs_info->trans_lock);
			continue;
		}
		if (t == fs_info->running_transaction) {
			t->state = TRANS_STATE_COMMIT_DOING;
			spin_unlock(&fs_info->trans_lock);
			/*
			 * We wait for 0 num_writers since we don't hold a trans
			 * handle open currently for this transaction.
			 */
			wait_event(t->writer_wait,
				   atomic_read(&t->num_writers) == 0);
		} else {
			spin_unlock(&fs_info->trans_lock);
		}
		btrfs_cleanup_one_transaction(t, fs_info);

		spin_lock(&fs_info->trans_lock);
		if (t == fs_info->running_transaction)
			fs_info->running_transaction = NULL;
		list_del_init(&t->list);
		spin_unlock(&fs_info->trans_lock);

		btrfs_put_transaction(t);
		trace_btrfs_transaction_commit(fs_info->tree_root);
		spin_lock(&fs_info->trans_lock);
	}
	spin_unlock(&fs_info->trans_lock);
	btrfs_destroy_all_ordered_extents(fs_info);
	btrfs_destroy_delayed_inodes(fs_info);
	btrfs_assert_delayed_root_empty(fs_info);
	btrfs_destroy_all_delalloc_inodes(fs_info);
	btrfs_drop_all_logs(fs_info);
	mutex_unlock(&fs_info->transaction_kthread_mutex);

	return 0;
}

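/*
 * Find the highest objectid currently used in @root and set
 * root->free_objectid to the next usable value, at least
 * BTRFS_FIRST_FREE_OBJECTID.
 */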
int btrfs_init_root_free_objectid(struct btrfs_root *root)
{
	struct btrfs_path *path;
	int ret;
	struct extent_buffer *l;
	struct btrfs_key search_key;
	struct btrfs_key found_key;
	int slot;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
	search_key.type = -1;
	search_key.offset = (u64)-1;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret < 0)
		goto error;
	BUG_ON(ret == 0); /* Corruption */
	if (path->slots[0] > 0) {
		slot = path->slots[0] - 1;
		l = path->nodes[0];
		btrfs_item_key_to_cpu(l, &found_key, slot);
		root->free_objectid = max_t(u64, found_key.objectid + 1,
					    BTRFS_FIRST_FREE_OBJECTID);
	} else {
		root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

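/*
 * Hand out the next free objectid of @root under objectid_mutex, failing
 * with -ENOSPC once the objectid space is exhausted.
 */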
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
{
	int ret;

	mutex_lock(&root->objectid_mutex);

	if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
		btrfs_warn(root->fs_info,
			   "the objectid of root %llu reaches its highest value",
			   root->root_key.objectid);
		ret = -ENOSPC;
		goto out;
	}

	*objectid = root->free_objectid++;
	ret = 0;
out:
	mutex_unlock(&root->objectid_mutex);
	return ret;
}