// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/migrate.h>
#include <linux/ratelimit.h>
#include <linux/uuid.h>
#include <linux/semaphore.h>
#include <linux/error-injection.h>
#include <linux/crc32c.h>
#include <linux/sched/mm.h>
#include <asm/unaligned.h>
#include <crypto/hash.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "print-tree.h"
#include "locking.h"
#include "tree-log.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "inode-map.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "raid56.h"
#include "sysfs.h"
#include "qgroup.h"
#include "compression.h"
#include "tree-checker.h"
#include "ref-verify.h"
#include "block-group.h"
#include "discard.h"
#include "space-info.h"

#define BTRFS_SUPER_FLAG_SUPP	(BTRFS_HEADER_FLAG_WRITTEN |\
				 BTRFS_HEADER_FLAG_RELOC |\
				 BTRFS_SUPER_FLAG_ERROR |\
				 BTRFS_SUPER_FLAG_SEEDING |\
				 BTRFS_SUPER_FLAG_METADUMP |\
				 BTRFS_SUPER_FLAG_METADUMP_V2)

static const struct extent_io_ops btree_extent_io_ops;
static void end_workqueue_fn(struct btrfs_work *work);
static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
				      struct btrfs_fs_info *fs_info);
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
					struct extent_io_tree *dirty_pages,
					int mark);
static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
				       struct extent_io_tree *pinned_extents);
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);

/*
 * btrfs_end_io_wq structs are used to do processing in task context when an IO
 * is complete.  This is used during reads to verify checksums, and it is used
 * by writes to insert metadata for new file extents after IO is complete.
 */
struct btrfs_end_io_wq {
	struct bio *bio;
	bio_end_io_t *end_io;
	void *private;
	struct btrfs_fs_info *info;
	blk_status_t status;
	enum btrfs_wq_endio_type metadata;
	struct btrfs_work work;
};

static struct kmem_cache *btrfs_end_io_wq_cache;

int __init btrfs_end_io_wq_init(void)
{
	btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
					sizeof(struct btrfs_end_io_wq),
					0,
					SLAB_MEM_SPREAD,
					NULL);
	if (!btrfs_end_io_wq_cache)
		return -ENOMEM;
	return 0;
}

void __cold btrfs_end_io_wq_exit(void)
{
	kmem_cache_destroy(btrfs_end_io_wq_cache);
}

static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
	if (fs_info->csum_shash)
		crypto_free_shash(fs_info->csum_shash);
}

/*
 * async submit bios are used to offload expensive checksumming
 * onto the worker threads.  They checksum file and metadata bios
 * just before they are sent down the IO stack.
 */
struct async_submit_bio {
	void *private_data;
	struct bio *bio;
	extent_submit_bio_start_t *submit_bio_start;
	int mirror_num;
	/*
	 * bio_offset is optional, can be used if the pages in the bio
	 * can't tell us where in the file the bio should go
	 */
	u64 bio_offset;
	struct btrfs_work work;
	blk_status_t status;
};

/*
 * Lockdep class keys for extent_buffer->lock's in this root.  For a given
 * eb, the lockdep key is determined by the btrfs_root it belongs to and
 * the level the eb occupies in the tree.
 *
 * Different roots are used for different purposes and may nest inside each
 * other and they require separate keysets.  As lockdep keys should be
 * static, assign keysets according to the purpose of the root as indicated
 * by btrfs_root->root_key.objectid.  This ensures that all special purpose
 * roots have separate keysets.
 *
 * Lock-nesting across peer nodes is always done with the immediate parent
 * node locked thus preventing deadlock.  As lockdep doesn't know this, use
 * subclass to avoid triggering lockdep warning in such cases.
 *
 * The key is set by the readpage_end_io_hook after the buffer has passed
 * csum validation but before the pages are unlocked.  It is also set by
 * btrfs_init_new_buffer on freshly allocated blocks.
 *
 * We also add a check to make sure the highest level of the tree is the
 * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
 * needs update as well.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
# if BTRFS_MAX_LEVEL != 8
#  error
# endif

static struct btrfs_lockdep_keyset {
	u64			id;		/* root objectid */
	const char		*name_stem;	/* lock name stem */
	char			names[BTRFS_MAX_LEVEL + 1][20];
	struct lock_class_key	keys[BTRFS_MAX_LEVEL + 1];
} btrfs_lockdep_keysets[] = {
	{ .id = BTRFS_ROOT_TREE_OBJECTID,	.name_stem = "root"	},
	{ .id = BTRFS_EXTENT_TREE_OBJECTID,	.name_stem = "extent"	},
	{ .id = BTRFS_CHUNK_TREE_OBJECTID,	.name_stem = "chunk"	},
	{ .id = BTRFS_DEV_TREE_OBJECTID,	.name_stem = "dev"	},
	{ .id = BTRFS_FS_TREE_OBJECTID,		.name_stem = "fs"	},
	{ .id = BTRFS_CSUM_TREE_OBJECTID,	.name_stem = "csum"	},
	{ .id = BTRFS_QUOTA_TREE_OBJECTID,	.name_stem = "quota"	},
	{ .id = BTRFS_TREE_LOG_OBJECTID,	.name_stem = "log"	},
	{ .id = BTRFS_TREE_RELOC_OBJECTID,	.name_stem = "treloc"	},
	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	.name_stem = "dreloc"	},
	{ .id = BTRFS_UUID_TREE_OBJECTID,	.name_stem = "uuid"	},
	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	.name_stem = "free-space" },
	{ .id = 0,				.name_stem = "tree"	},
};

void __init btrfs_init_lockdep(void)
{
	int i, j;

	/* initialize lockdep class names */
	for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
		struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];

		for (j = 0; j < ARRAY_SIZE(ks->names); j++)
			snprintf(ks->names[j], sizeof(ks->names[j]),
				 "btrfs-%s-%02d", ks->name_stem, j);
	}
}

void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
				    int level)
{
	struct btrfs_lockdep_keyset *ks;

	BUG_ON(level >= ARRAY_SIZE(ks->keys));

	/* find the matching keyset, id 0 is the default entry */
	for (ks = btrfs_lockdep_keysets; ks->id; ks++)
		if (ks->id == objectid)
			break;

	lockdep_set_class_and_name(&eb->lock,
				   &ks->keys[level], ks->names[level]);
}

#endif

/*
 * extents on the btree inode are pretty simple, there's one extent
 * that covers the entire device
 */
struct extent_map *btree_get_extent(struct btrfs_inode *inode,
				    struct page *page, size_t pg_offset,
				    u64 start, u64 len)
{
	struct extent_map_tree *em_tree = &inode->extent_tree;
	struct extent_map *em;
	int ret;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, start, len);
	if (em) {
		read_unlock(&em_tree->lock);
		goto out;
	}
	read_unlock(&em_tree->lock);

	em = alloc_extent_map();
	if (!em) {
		em = ERR_PTR(-ENOMEM);
		goto out;
	}
	em->start = 0;
	em->len = (u64)-1;
	em->block_len = (u64)-1;
	em->block_start = 0;

	write_lock(&em_tree->lock);
	ret = add_extent_mapping(em_tree, em, 0);
	if (ret == -EEXIST) {
		free_extent_map(em);
		em = lookup_extent_mapping(em_tree, start, len);
		if (!em)
			em = ERR_PTR(-EIO);
	} else if (ret) {
		free_extent_map(em);
		em = ERR_PTR(ret);
	}
	write_unlock(&em_tree->lock);

out:
	return em;
}

/*
 * Compute the csum of a btree block and store the result to provided buffer.
 *
 * Returns error if the extent buffer cannot be mapped.
 */
static int csum_tree_block(struct extent_buffer *buf, u8 *result)
{
	struct btrfs_fs_info *fs_info = buf->fs_info;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	unsigned long len;
	unsigned long cur_len;
	unsigned long offset = BTRFS_CSUM_SIZE;
	char *kaddr;
	unsigned long map_start;
	unsigned long map_len;
	int err;

	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);

	len = buf->len - offset;

	while (len > 0) {
		/*
		 * Note: we don't need to check for the err == 1 case here, as
		 * with the given combination of 'start = BTRFS_CSUM_SIZE (32)'
		 * and 'min_len = 32' and the currently implemented mapping
		 * algorithm we cannot cross a page boundary.
		 */
		err = map_private_extent_buffer(buf, offset, 32,
					&kaddr, &map_start, &map_len);
		if (WARN_ON(err))
			return err;
		cur_len = min(len, map_len - (offset - map_start));
		crypto_shash_update(shash, kaddr + offset - map_start, cur_len);
		len -= cur_len;
		offset += cur_len;
	}
	memset(result, 0, BTRFS_CSUM_SIZE);

	crypto_shash_final(shash, result);

	return 0;
}

C
Chris Mason 已提交
300 301 302 303 304 305
/*
 * we can't consider a given block up to date unless the transid of the
 * block matches the transid in the parent node's pointer.  This is how we
 * detect blocks that either didn't get written at all or got written
 * in the wrong place.
 */
306
static int verify_parent_transid(struct extent_io_tree *io_tree,
307 308
				 struct extent_buffer *eb, u64 parent_transid,
				 int atomic)
309
{
310
	struct extent_state *cached_state = NULL;
311
	int ret;
312
	bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB);
313 314 315 316

	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
		return 0;

317 318 319
	if (atomic)
		return -EAGAIN;

320 321
	if (need_lock) {
		btrfs_tree_read_lock(eb);
322
		btrfs_set_lock_blocking_read(eb);
323 324
	}

325
	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
326
			 &cached_state);
327
	if (extent_buffer_uptodate(eb) &&
328 329 330 331
	    btrfs_header_generation(eb) == parent_transid) {
		ret = 0;
		goto out;
	}
332 333 334
	btrfs_err_rl(eb->fs_info,
		"parent transid verify failed on %llu wanted %llu found %llu",
			eb->start,
335
			parent_transid, btrfs_header_generation(eb));
336
	ret = 1;
337 338 339 340

	/*
	 * Things reading via commit roots that don't have normal protection,
	 * like send, can have a really old block in cache that may point at a
341
	 * block that has been freed and re-allocated.  So don't clear uptodate
342 343 344 345 346 347
	 * if we find an eb that is under IO (dirty/writeback) because we could
	 * end up reading in the stale data and then writing it back out and
	 * making everybody very sad.
	 */
	if (!extent_buffer_under_io(eb))
		clear_extent_buffer_uptodate(eb);
C
Chris Mason 已提交
348
out:
349
	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
350
			     &cached_state);
351 352
	if (need_lock)
		btrfs_tree_read_unlock_blocking(eb);
353 354 355
	return ret;
}

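/*
 * Return true if @csum_type names a checksum algorithm this implementation
 * knows how to use; superblocks carrying any other type are rejected.
 */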
static bool btrfs_supported_super_csum(u16 csum_type)
{
	switch (csum_type) {
	case BTRFS_CSUM_TYPE_CRC32:
	case BTRFS_CSUM_TYPE_XXHASH:
	case BTRFS_CSUM_TYPE_SHA256:
	case BTRFS_CSUM_TYPE_BLAKE2:
		return true;
	default:
		return false;
	}
}

/*
 * Return 0 if the superblock checksum type matches the checksum value of that
 * algorithm. Pass the raw disk superblock data.
 */
static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
				  char *raw_disk_sb)
{
	struct btrfs_super_block *disk_sb =
		(struct btrfs_super_block *)raw_disk_sb;
	char result[BTRFS_CSUM_SIZE];
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);

	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);

	/*
	 * The super_block structure does not span the whole
	 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
	 * filled with zeros and is included in the checksum.
	 */
	crypto_shash_update(shash, raw_disk_sb + BTRFS_CSUM_SIZE,
			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
	crypto_shash_final(shash, result);

	if (memcmp(disk_sb->csum, result, btrfs_super_csum_size(disk_sb)))
		return 1;

	return 0;
}

int btrfs_verify_level_key(struct extent_buffer *eb, int level,
			   struct btrfs_key *first_key, u64 parent_transid)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	int found_level;
	struct btrfs_key found_key;
	int ret;

	found_level = btrfs_header_level(eb);
	if (found_level != level) {
		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
		     KERN_ERR "BTRFS: tree level check failed\n");
		btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
			  eb->start, level, found_level);
		return -EIO;
	}

	if (!first_key)
		return 0;

	/*
	 * For live tree blocks (new tree blocks in the current transaction),
	 * we need proper lock context to avoid races, which is impossible
	 * here.  So we only check tree blocks which are read from disk, whose
	 * generation <= fs_info->last_trans_committed.
	 */
	if (btrfs_header_generation(eb) > fs_info->last_trans_committed)
		return 0;

	/* We have @first_key, so this @eb must have at least one item */
	if (btrfs_header_nritems(eb) == 0) {
		btrfs_err(fs_info,
		"invalid tree nritems, bytenr=%llu nritems=0 expect >0",
			  eb->start);
		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
		return -EUCLEAN;
	}

	if (found_level)
		btrfs_node_key_to_cpu(eb, &found_key, 0);
	else
		btrfs_item_key_to_cpu(eb, &found_key, 0);
	ret = btrfs_comp_cpu_keys(first_key, &found_key);

	if (ret) {
		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
		     KERN_ERR "BTRFS: tree first key check failed\n");
		btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
			  eb->start, parent_transid, first_key->objectid,
			  first_key->type, first_key->offset,
			  found_key.objectid, found_key.type,
			  found_key.offset);
	}
	return ret;
}

/*
 * helper to read a given tree block, doing retries as required when
 * the checksums don't match and we have alternate mirrors to try.
 *
 * @parent_transid:	expected transid, skip check if 0
 * @level:		expected level, mandatory check
 * @first_key:		expected key of first slot, skip check if NULL
 */
static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
					  u64 parent_transid, int level,
					  struct btrfs_key *first_key)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct extent_io_tree *io_tree;
	int failed = 0;
	int ret;
	int num_copies = 0;
	int mirror_num = 0;
	int failed_mirror = 0;

	io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
	while (1) {
		clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
		ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
		if (!ret) {
			if (verify_parent_transid(io_tree, eb,
						   parent_transid, 0))
				ret = -EIO;
			else if (btrfs_verify_level_key(eb, level,
						first_key, parent_transid))
				ret = -EUCLEAN;
			else
				break;
		}

		num_copies = btrfs_num_copies(fs_info,
					      eb->start, eb->len);
		if (num_copies == 1)
			break;

		if (!failed_mirror) {
			failed = 1;
			failed_mirror = eb->read_mirror;
		}

		mirror_num++;
		if (mirror_num == failed_mirror)
			mirror_num++;

		if (mirror_num > num_copies)
			break;
	}

	if (failed && !ret && failed_mirror)
		btrfs_repair_eb_io_failure(eb, failed_mirror);

	return ret;
}

/*
 * checksum a dirty tree block before IO.  This has extra checks to make sure
 * we only fill in the checksum field in the first page of a multi-page block
 */

static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page)
{
	u64 start = page_offset(page);
	u64 found_start;
	u8 result[BTRFS_CSUM_SIZE];
	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
	struct extent_buffer *eb;
	int ret;

	eb = (struct extent_buffer *)page->private;
	if (page != eb->pages[0])
		return 0;

	found_start = btrfs_header_bytenr(eb);
	/*
	 * Please do not consolidate these warnings into a single if.
	 * It is useful to know what went wrong.
	 */
	if (WARN_ON(found_start != start))
		return -EUCLEAN;
	if (WARN_ON(!PageUptodate(page)))
		return -EUCLEAN;

	ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
			btrfs_header_fsid(), BTRFS_FSID_SIZE) == 0);

	if (csum_tree_block(eb, result))
		return -EINVAL;

	if (btrfs_header_level(eb))
		ret = btrfs_check_node(eb);
	else
		ret = btrfs_check_leaf_full(eb);

	if (ret < 0) {
		btrfs_print_tree(eb, 0);
		btrfs_err(fs_info,
		"block=%llu write time tree block corruption detected",
			  eb->start);
		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
		return ret;
	}
	write_extent_buffer(eb, result, 0, csum_size);

	return 0;
}

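/*
 * Check that the fsid in the tree block header matches this filesystem or
 * one of its seed devices.  Returns 0 on a match and 1 otherwise.
 */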
static int check_tree_block_fsid(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	u8 fsid[BTRFS_FSID_SIZE];
	int ret = 1;

	read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE);
	while (fs_devices) {
		u8 *metadata_uuid;

		/*
		 * Checking the incompat flag is only valid for the current
		 * fs. For seed devices it's forbidden to have their uuid
		 * changed so reading ->fsid in this case is fine
		 */
		if (fs_devices == fs_info->fs_devices &&
		    btrfs_fs_incompat(fs_info, METADATA_UUID))
			metadata_uuid = fs_devices->metadata_uuid;
		else
			metadata_uuid = fs_devices->fsid;

		if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) {
			ret = 0;
			break;
		}
		fs_devices = fs_devices->seed;
	}
	return ret;
}

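/*
 * Read completion hook for btree pages: verifies the block's bytenr, fsid,
 * level and checksum, then runs the tree checker before the buffer is
 * marked uptodate.
 */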
static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
				      u64 phy_offset, struct page *page,
				      u64 start, u64 end, int mirror)
{
	u64 found_start;
	int found_level;
	struct extent_buffer *eb;
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
	int ret = 0;
	u8 result[BTRFS_CSUM_SIZE];
	int reads_done;

	if (!page->private)
		goto out;

	eb = (struct extent_buffer *)page->private;

	/* the pending IO might have been the only thing that kept this buffer
	 * in memory.  Make sure we have a ref for all these other checks
	 */
	atomic_inc(&eb->refs);

	reads_done = atomic_dec_and_test(&eb->io_pages);
	if (!reads_done)
		goto err;

	eb->read_mirror = mirror;
	if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) {
		ret = -EIO;
		goto err;
	}

	found_start = btrfs_header_bytenr(eb);
	if (found_start != eb->start) {
		btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu",
			     eb->start, found_start);
		ret = -EIO;
		goto err;
	}
	if (check_tree_block_fsid(eb)) {
		btrfs_err_rl(fs_info, "bad fsid on block %llu",
			     eb->start);
		ret = -EIO;
		goto err;
	}
	found_level = btrfs_header_level(eb);
	if (found_level >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info, "bad tree block level %d on %llu",
			  (int)btrfs_header_level(eb), eb->start);
		ret = -EIO;
		goto err;
	}

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
				       eb, found_level);

	ret = csum_tree_block(eb, result);
	if (ret)
		goto err;

	if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
		u32 val;
		u32 found = 0;

		memcpy(&found, result, csum_size);

		read_extent_buffer(eb, &val, 0, csum_size);
		btrfs_warn_rl(fs_info,
		"%s checksum verify failed on %llu wanted %x found %x level %d",
			      fs_info->sb->s_id, eb->start,
			      val, found, btrfs_header_level(eb));
		ret = -EUCLEAN;
		goto err;
	}

	/*
	 * If this is a leaf block and it is corrupt, set the corrupt bit so
	 * that we don't try and read the other copies of this block, just
	 * return -EIO.
	 */
	if (found_level == 0 && btrfs_check_leaf_full(eb)) {
		set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
		ret = -EIO;
	}

	if (found_level > 0 && btrfs_check_node(eb))
		ret = -EIO;

	if (!ret)
		set_extent_buffer_uptodate(eb);
	else
		btrfs_err(fs_info,
			  "block=%llu read time tree block corruption detected",
			  eb->start);
err:
	if (reads_done &&
	    test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
		btree_readahead_hook(eb, ret);

	if (ret) {
		/*
		 * our io error hook is going to dec the io pages
		 * again, we have to make sure it has something
		 * to decrement
		 */
		atomic_inc(&eb->io_pages);
		clear_extent_buffer_uptodate(eb);
	}
	free_extent_buffer(eb);
out:
	return ret;
}

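/*
 * Completion callback installed by btrfs_bio_wq_end_io().  It only picks
 * the workqueue that matches the IO type and queues end_workqueue_fn(),
 * which finishes the bio in task context.
 */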
static void end_workqueue_bio(struct bio *bio)
{
	struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
	struct btrfs_fs_info *fs_info;
	struct btrfs_workqueue *wq;

	fs_info = end_io_wq->info;
	end_io_wq->status = bio->bi_status;

	if (bio_op(bio) == REQ_OP_WRITE) {
		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
			wq = fs_info->endio_meta_write_workers;
		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
			wq = fs_info->endio_freespace_worker;
		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
			wq = fs_info->endio_raid56_workers;
		else
			wq = fs_info->endio_write_workers;
	} else {
		if (unlikely(end_io_wq->metadata == BTRFS_WQ_ENDIO_DIO_REPAIR))
			wq = fs_info->endio_repair_workers;
		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
			wq = fs_info->endio_raid56_workers;
		else if (end_io_wq->metadata)
			wq = fs_info->endio_meta_workers;
		else
			wq = fs_info->endio_workers;
	}

	btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
	btrfs_queue_work(wq, &end_io_wq->work);
}

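/*
 * Redirect a bio's completion to a workqueue.  The original bi_private and
 * bi_end_io are saved in the btrfs_end_io_wq and restored by
 * end_workqueue_fn() once the deferred work runs.
 */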
blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
			enum btrfs_wq_endio_type metadata)
{
	struct btrfs_end_io_wq *end_io_wq;

	end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS);
	if (!end_io_wq)
		return BLK_STS_RESOURCE;

	end_io_wq->private = bio->bi_private;
	end_io_wq->end_io = bio->bi_end_io;
	end_io_wq->info = info;
	end_io_wq->status = 0;
	end_io_wq->bio = bio;
	end_io_wq->metadata = metadata;

	bio->bi_private = end_io_wq;
	bio->bi_end_io = end_workqueue_bio;
	return 0;
}

static void run_one_async_start(struct btrfs_work *work)
{
	struct async_submit_bio *async;
	blk_status_t ret;

	async = container_of(work, struct  async_submit_bio, work);
	ret = async->submit_bio_start(async->private_data, async->bio,
				      async->bio_offset);
	if (ret)
		async->status = ret;
}

/*
 * In order to insert checksums into the metadata in large chunks, we wait
 * until bio submission time.  All the pages in the bio are checksummed and
 * sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached on the ordered extent record are
 * inserted into the tree.
 */
static void run_one_async_done(struct btrfs_work *work)
{
	struct async_submit_bio *async;
	struct inode *inode;
	blk_status_t ret;

	async = container_of(work, struct  async_submit_bio, work);
	inode = async->private_data;

	/* If an error occurred we just want to clean up the bio and move on */
	if (async->status) {
		async->bio->bi_status = async->status;
		bio_endio(async->bio);
		return;
	}

	/*
	 * All of the bios that pass through here are from async helpers.
	 * Use REQ_CGROUP_PUNT to issue them from the owning cgroup's context.
	 * This changes nothing when cgroups aren't in use.
	 */
	async->bio->bi_opf |= REQ_CGROUP_PUNT;
	ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
	if (ret) {
		async->bio->bi_status = ret;
		bio_endio(async->bio);
	}
}

static void run_one_async_free(struct btrfs_work *work)
{
	struct async_submit_bio *async;

	async = container_of(work, struct  async_submit_bio, work);
	kfree(async);
}

blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
				 int mirror_num, unsigned long bio_flags,
				 u64 bio_offset, void *private_data,
				 extent_submit_bio_start_t *submit_bio_start)
{
	struct async_submit_bio *async;

	async = kmalloc(sizeof(*async), GFP_NOFS);
	if (!async)
		return BLK_STS_RESOURCE;

	async->private_data = private_data;
	async->bio = bio;
	async->mirror_num = mirror_num;
	async->submit_bio_start = submit_bio_start;

	btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
			run_one_async_free);

	async->bio_offset = bio_offset;

	async->status = 0;

	if (op_is_sync(bio->bi_opf))
		btrfs_set_work_high_priority(&async->work);

	btrfs_queue_work(fs_info->workers, &async->work);
	return 0;
}

static blk_status_t btree_csum_one_bio(struct bio *bio)
{
	struct bio_vec *bvec;
	struct btrfs_root *root;
	int ret = 0;
	struct bvec_iter_all iter_all;

	ASSERT(!bio_flagged(bio, BIO_CLONED));
	bio_for_each_segment_all(bvec, bio, iter_all) {
		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
		ret = csum_dirty_buffer(root->fs_info, bvec->bv_page);
		if (ret)
			break;
	}

	return errno_to_blk_status(ret);
}

static blk_status_t btree_submit_bio_start(void *private_data, struct bio *bio,
					     u64 bio_offset)
{
	/*
	 * when we're called for a write, we're already in the async
	 * submission context.  Just jump into btrfs_map_bio
	 */
	return btree_csum_one_bio(bio);
}

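/*
 * Decide whether a metadata write should be checksummed by the worker
 * threads: do it inline when someone is waiting synchronously or when the
 * checksum implementation is fast.
 */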
static int check_async_write(struct btrfs_fs_info *fs_info,
			     struct btrfs_inode *bi)
{
	if (atomic_read(&bi->sync_writers))
		return 0;
	if (test_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags))
		return 0;
	return 1;
}

static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio,
					  int mirror_num,
					  unsigned long bio_flags)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	int async = check_async_write(fs_info, BTRFS_I(inode));
	blk_status_t ret;

	if (bio_op(bio) != REQ_OP_WRITE) {
		/*
		 * called for a read, do the setup so that checksum validation
		 * can happen in the async kernel threads
		 */
		ret = btrfs_bio_wq_end_io(fs_info, bio,
					  BTRFS_WQ_ENDIO_METADATA);
		if (ret)
			goto out_w_error;
		ret = btrfs_map_bio(fs_info, bio, mirror_num);
	} else if (!async) {
		ret = btree_csum_one_bio(bio);
		if (ret)
			goto out_w_error;
		ret = btrfs_map_bio(fs_info, bio, mirror_num);
	} else {
		/*
		 * kthread helpers are used to submit writes so that
		 * checksumming can happen in parallel across all CPUs
		 */
		ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, 0,
					  0, inode, btree_submit_bio_start);
	}

	if (ret)
		goto out_w_error;
	return 0;

out_w_error:
	bio->bi_status = ret;
	bio_endio(bio);
	return ret;
}

#ifdef CONFIG_MIGRATION
static int btree_migratepage(struct address_space *mapping,
			struct page *newpage, struct page *page,
			enum migrate_mode mode)
{
	/*
	 * we can't safely write a btree page from here,
	 * we haven't done the locking hook
	 */
	if (PageDirty(page))
		return -EAGAIN;
	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
	if (page_has_private(page) &&
	    !try_to_release_page(page, GFP_KERNEL))
		return -EAGAIN;
	return migrate_page(mapping, newpage, page, mode);
}
#endif

static int btree_writepages(struct address_space *mapping,
			    struct writeback_control *wbc)
{
	struct btrfs_fs_info *fs_info;
	int ret;

	if (wbc->sync_mode == WB_SYNC_NONE) {

		if (wbc->for_kupdate)
			return 0;

		fs_info = BTRFS_I(mapping->host)->root->fs_info;
		/* this is a bit racy, but that's ok */
		ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
					     BTRFS_DIRTY_METADATA_THRESH,
					     fs_info->dirty_metadata_batch);
		if (ret < 0)
			return 0;
	}
	return btree_write_cache_pages(mapping, wbc);
}

static int btree_readpage(struct file *file, struct page *page)
{
	return extent_read_full_page(page, btree_get_extent, 0);
}

static int btree_releasepage(struct page *page, gfp_t gfp_flags)
{
	if (PageWriteback(page) || PageDirty(page))
		return 0;

	return try_release_extent_buffer(page);
}

static void btree_invalidatepage(struct page *page, unsigned int offset,
				 unsigned int length)
{
	struct extent_io_tree *tree;
	tree = &BTRFS_I(page->mapping->host)->io_tree;
	extent_invalidatepage(tree, page, offset);
	btree_releasepage(page, GFP_NOFS);
	if (PagePrivate(page)) {
		btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info,
			   "page private not zero on page %llu",
			   (unsigned long long)page_offset(page));
		ClearPagePrivate(page);
		set_page_private(page, 0);
		put_page(page);
	}
}

static int btree_set_page_dirty(struct page *page)
{
#ifdef DEBUG
	struct extent_buffer *eb;

	BUG_ON(!PagePrivate(page));
	eb = (struct extent_buffer *)page->private;
	BUG_ON(!eb);
	BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
	BUG_ON(!atomic_read(&eb->refs));
	btrfs_assert_tree_locked(eb);
#endif
	return __set_page_dirty_nobuffers(page);
}

static const struct address_space_operations btree_aops = {
	.readpage	= btree_readpage,
	.writepages	= btree_writepages,
	.releasepage	= btree_releasepage,
	.invalidatepage = btree_invalidatepage,
#ifdef CONFIG_MIGRATION
	.migratepage	= btree_migratepage,
#endif
	.set_page_dirty = btree_set_page_dirty,
};

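/*
 * Kick off a non-blocking read of a tree block to warm the cache; any
 * error is ignored and the buffer reference is dropped either way.
 */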
void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct extent_buffer *buf = NULL;
	int ret;

	buf = btrfs_find_create_tree_block(fs_info, bytenr);
	if (IS_ERR(buf))
		return;

	ret = read_extent_buffer_pages(buf, WAIT_NONE, 0);
	if (ret < 0)
		free_extent_buffer_stale(buf);
	else
		free_extent_buffer(buf);
}

struct extent_buffer *btrfs_find_create_tree_block(
						struct btrfs_fs_info *fs_info,
						u64 bytenr)
{
	if (btrfs_is_testing(fs_info))
		return alloc_test_extent_buffer(fs_info, bytenr);
	return alloc_extent_buffer(fs_info, bytenr);
}

/*
 * Read tree block at logical address @bytenr and do various basic but
 * critical verifications.
 *
 * @parent_transid:	expected transid of this tree block, skip check if 0
 * @level:		expected level, mandatory check
 * @first_key:		expected key in slot 0, skip check if NULL
 */
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
				      u64 parent_transid, int level,
				      struct btrfs_key *first_key)
{
	struct extent_buffer *buf = NULL;
	int ret;

	buf = btrfs_find_create_tree_block(fs_info, bytenr);
	if (IS_ERR(buf))
		return buf;

	ret = btree_read_extent_buffer_pages(buf, parent_transid,
					     level, first_key);
	if (ret) {
		free_extent_buffer_stale(buf);
		return ERR_PTR(ret);
	}
	return buf;
}

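/*
 * Clear the dirty bit of a buffer that was changed in the currently
 * running transaction and update the dirty metadata accounting.
 */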
void btrfs_clean_tree_block(struct extent_buffer *buf)
{
	struct btrfs_fs_info *fs_info = buf->fs_info;
	if (btrfs_header_generation(buf) ==
	    fs_info->running_transaction->transid) {
		btrfs_assert_tree_locked(buf);

		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
			percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
						 -buf->len,
						 fs_info->dirty_metadata_batch);
			/* ugh, clear_extent_buffer_dirty needs to lock the page */
			btrfs_set_lock_blocking_write(buf);
			clear_extent_buffer_dirty(buf);
		}
	}
}

static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
{
	struct btrfs_subvolume_writers *writers;
	int ret;

	writers = kmalloc(sizeof(*writers), GFP_NOFS);
	if (!writers)
		return ERR_PTR(-ENOMEM);

	ret = percpu_counter_init(&writers->counter, 0, GFP_NOFS);
	if (ret < 0) {
		kfree(writers);
		return ERR_PTR(ret);
	}

	init_waitqueue_head(&writers->wait);
	return writers;
}

static void
btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers)
{
	percpu_counter_destroy(&writers->counter);
	kfree(writers);
}

static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
			 u64 objectid)
{
	bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
	root->fs_info = fs_info;
	root->node = NULL;
	root->commit_root = NULL;
	root->state = 0;
	root->orphan_cleanup_state = 0;

	root->last_trans = 0;
	root->highest_objectid = 0;
	root->nr_delalloc_inodes = 0;
	root->nr_ordered_extents = 0;
	root->inode_tree = RB_ROOT;
	INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
	root->block_rsv = NULL;

	INIT_LIST_HEAD(&root->dirty_list);
	INIT_LIST_HEAD(&root->root_list);
	INIT_LIST_HEAD(&root->delalloc_inodes);
	INIT_LIST_HEAD(&root->delalloc_root);
	INIT_LIST_HEAD(&root->ordered_extents);
	INIT_LIST_HEAD(&root->ordered_root);
	INIT_LIST_HEAD(&root->reloc_dirty_list);
	INIT_LIST_HEAD(&root->logged_list[0]);
	INIT_LIST_HEAD(&root->logged_list[1]);
	spin_lock_init(&root->inode_lock);
	spin_lock_init(&root->delalloc_lock);
	spin_lock_init(&root->ordered_extent_lock);
	spin_lock_init(&root->accounting_lock);
	spin_lock_init(&root->log_extents_lock[0]);
	spin_lock_init(&root->log_extents_lock[1]);
	spin_lock_init(&root->qgroup_meta_rsv_lock);
	mutex_init(&root->objectid_mutex);
	mutex_init(&root->log_mutex);
	mutex_init(&root->ordered_extent_mutex);
	mutex_init(&root->delalloc_mutex);
	init_waitqueue_head(&root->log_writer_wait);
	init_waitqueue_head(&root->log_commit_wait[0]);
	init_waitqueue_head(&root->log_commit_wait[1]);
	INIT_LIST_HEAD(&root->log_ctxs[0]);
	INIT_LIST_HEAD(&root->log_ctxs[1]);
	atomic_set(&root->log_commit[0], 0);
	atomic_set(&root->log_commit[1], 0);
	atomic_set(&root->log_writers, 0);
	atomic_set(&root->log_batch, 0);
	refcount_set(&root->refs, 1);
	atomic_set(&root->will_be_snapshotted, 0);
	atomic_set(&root->snapshot_force_cow, 0);
	atomic_set(&root->nr_swapfiles, 0);
	root->log_transid = 0;
	root->log_transid_committed = -1;
	root->last_log_commit = 0;
	if (!dummy)
		extent_io_tree_init(fs_info, &root->dirty_log_pages,
				    IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);

	memset(&root->root_key, 0, sizeof(root->root_key));
	memset(&root->root_item, 0, sizeof(root->root_item));
	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
	if (!dummy)
		root->defrag_trans_start = fs_info->generation;
	else
		root->defrag_trans_start = 0;
	root->root_key.objectid = objectid;
	root->anon_dev = 0;

	spin_lock_init(&root->root_item_lock);
	btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
#ifdef CONFIG_BTRFS_DEBUG
	INIT_LIST_HEAD(&root->leak_list);
	spin_lock(&fs_info->fs_roots_radix_lock);
	list_add_tail(&root->leak_list, &fs_info->allocated_roots);
	spin_unlock(&fs_info->fs_roots_radix_lock);
#endif
}

static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
					   u64 objectid, gfp_t flags)
{
	struct btrfs_root *root = kzalloc(sizeof(*root), flags);
	if (root)
		__setup_root(root, fs_info, objectid);
	return root;
}

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* Should only be used by the testing infrastructure */
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;

	if (!fs_info)
		return ERR_PTR(-EINVAL);

	root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
	if (!root)
		return ERR_PTR(-ENOMEM);

	/* We don't use the stripesize in selftest, set it as sectorsize */
	root->alloc_bytenr = 0;

	return root;
}
#endif

struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
				     u64 objectid)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root;
	struct btrfs_key key;
	unsigned int nofs_flag;
	int ret = 0;
	uuid_le uuid = NULL_UUID_LE;

	/*
	 * We're holding a transaction handle, so use a NOFS memory allocation
	 * context to avoid deadlock if reclaim happens.
	 */
	nofs_flag = memalloc_nofs_save();
	root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
	memalloc_nofs_restore(nofs_flag);
	if (!root)
		return ERR_PTR(-ENOMEM);

	root->root_key.objectid = objectid;
	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
	root->root_key.offset = 0;

	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
	if (IS_ERR(leaf)) {
		ret = PTR_ERR(leaf);
		leaf = NULL;
		goto fail;
	}

	root->node = leaf;
	btrfs_mark_buffer_dirty(leaf);

	root->commit_root = btrfs_root_node(root);
	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);

	root->root_item.flags = 0;
	root->root_item.byte_limit = 0;
	btrfs_set_root_bytenr(&root->root_item, leaf->start);
	btrfs_set_root_generation(&root->root_item, trans->transid);
	btrfs_set_root_level(&root->root_item, 0);
	btrfs_set_root_refs(&root->root_item, 1);
	btrfs_set_root_used(&root->root_item, leaf->len);
	btrfs_set_root_last_snapshot(&root->root_item, 0);
	btrfs_set_root_dirid(&root->root_item, 0);
	if (is_fstree(objectid))
		uuid_le_gen(&uuid);
	memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
	root->root_item.drop_level = 0;

	key.objectid = objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = 0;
	ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
	if (ret)
		goto fail;

	btrfs_tree_unlock(leaf);

	return root;

fail:
	if (leaf) {
		btrfs_tree_unlock(leaf);
		free_extent_buffer(root->commit_root);
		free_extent_buffer(leaf);
	}
	btrfs_put_root(root);

	return ERR_PTR(ret);
}

static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
					 struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;
	struct extent_buffer *leaf;

	root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
	if (!root)
		return ERR_PTR(-ENOMEM);

	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;

	/*
	 * DON'T set REF_COWS for log trees
	 *
	 * log trees do not get reference counted because they go away
	 * before a real commit is actually done.  They do store pointers
	 * to file data extents, and those reference counts still get
	 * updated (along with back refs to the log tree).
	 */

	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
			NULL, 0, 0, 0);
	if (IS_ERR(leaf)) {
		btrfs_put_root(root);
		return ERR_CAST(leaf);
	}

	root->node = leaf;

	btrfs_mark_buffer_dirty(root->node);
	btrfs_tree_unlock(root->node);
	return root;
}

int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *log_root;

	log_root = alloc_log_tree(trans, fs_info);
	if (IS_ERR(log_root))
		return PTR_ERR(log_root);
	WARN_ON(fs_info->log_root_tree);
	fs_info->log_root_tree = log_root;
	return 0;
}

int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *log_root;
	struct btrfs_inode_item *inode_item;

	log_root = alloc_log_tree(trans, fs_info);
	if (IS_ERR(log_root))
		return PTR_ERR(log_root);

	log_root->last_trans = trans->transid;
	log_root->root_key.offset = root->root_key.objectid;

	inode_item = &log_root->root_item.inode;
	btrfs_set_stack_inode_generation(inode_item, 1);
	btrfs_set_stack_inode_size(inode_item, 3);
	btrfs_set_stack_inode_nlink(inode_item, 1);
	btrfs_set_stack_inode_nbytes(inode_item,
				     fs_info->nodesize);
	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);

	btrfs_set_root_node(&log_root->root_item, log_root->node);

	WARN_ON(root->log_root);
	root->log_root = log_root;
	root->log_transid = 0;
	root->log_transid_committed = -1;
	root->last_log_commit = 0;
	return 0;
}

struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
					struct btrfs_key *key)
{
	struct btrfs_root *root;
	struct btrfs_fs_info *fs_info = tree_root->fs_info;
	struct btrfs_path *path;
	u64 generation;
	int ret;
	int level;

	path = btrfs_alloc_path();
	if (!path)
		return ERR_PTR(-ENOMEM);

	root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
	if (!root) {
		ret = -ENOMEM;
		goto alloc_fail;
	}

	ret = btrfs_find_root(tree_root, key, path,
			      &root->root_item, &root->root_key);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto find_fail;
	}

	generation = btrfs_root_generation(&root->root_item);
	level = btrfs_root_level(&root->root_item);
	root->node = read_tree_block(fs_info,
				     btrfs_root_bytenr(&root->root_item),
				     generation, level, NULL);
	if (IS_ERR(root->node)) {
		ret = PTR_ERR(root->node);
		goto find_fail;
	} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
		ret = -EIO;
		free_extent_buffer(root->node);
		goto find_fail;
	}
	root->commit_root = btrfs_root_node(root);
out:
	btrfs_free_path(path);
	return root;

find_fail:
	btrfs_put_root(root);
alloc_fail:
	root = ERR_PTR(ret);
	goto out;
}

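/*
 * Set up the in-memory state of a subvolume root: the free inode caches,
 * subvolume writer counters, anonymous block device and the highest used
 * objectid.  On failure the caller must free the root.
 */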
static int btrfs_init_fs_root(struct btrfs_root *root)
{
	int ret;
	struct btrfs_subvolume_writers *writers;

	root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
	root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
					GFP_NOFS);
	if (!root->free_ino_pinned || !root->free_ino_ctl) {
		ret = -ENOMEM;
		goto fail;
	}

	writers = btrfs_alloc_subvolume_writers();
	if (IS_ERR(writers)) {
		ret = PTR_ERR(writers);
		goto fail;
	}
	root->subv_writers = writers;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
		set_bit(BTRFS_ROOT_REF_COWS, &root->state);
		btrfs_check_and_init_root_item(&root->root_item);
	}

	btrfs_init_free_ino_ctl(root);
	spin_lock_init(&root->ino_cache_lock);
	init_waitqueue_head(&root->ino_cache_wait);

	ret = get_anon_bdev(&root->anon_dev);
	if (ret)
		goto fail;

	mutex_lock(&root->objectid_mutex);
	ret = btrfs_find_highest_objectid(root,
					&root->highest_objectid);
	if (ret) {
		mutex_unlock(&root->objectid_mutex);
		goto fail;
	}

	ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);

	mutex_unlock(&root->objectid_mutex);

	return 0;
fail:
	/* The caller is responsible to call btrfs_free_fs_root */
	return ret;
}

static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
					       u64 root_id)
{
	struct btrfs_root *root;

	spin_lock(&fs_info->fs_roots_radix_lock);
	root = radix_tree_lookup(&fs_info->fs_roots_radix,
				 (unsigned long)root_id);
	if (root)
		root = btrfs_grab_root(root);
	spin_unlock(&fs_info->fs_roots_radix_lock);
	return root;
}

int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
			 struct btrfs_root *root)
{
	int ret;

	ret = radix_tree_preload(GFP_NOFS);
	if (ret)
		return ret;

	spin_lock(&fs_info->fs_roots_radix_lock);
	ret = radix_tree_insert(&fs_info->fs_roots_radix,
				(unsigned long)root->root_key.objectid,
				root);
	if (ret == 0) {
		btrfs_grab_root(root);
		set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
	}
	spin_unlock(&fs_info->fs_roots_radix_lock);
	radix_tree_preload_end();

	return ret;
}

void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
{
#ifdef CONFIG_BTRFS_DEBUG
	struct btrfs_root *root;

	while (!list_empty(&fs_info->allocated_roots)) {
		root = list_first_entry(&fs_info->allocated_roots,
					struct btrfs_root, leak_list);
		btrfs_err(fs_info, "leaked root %llu-%llu refcount %d",
			  root->root_key.objectid, root->root_key.offset,
			  refcount_read(&root->refs));
		while (refcount_read(&root->refs) > 1)
			btrfs_put_root(root);
		btrfs_put_root(root);
	}
#endif
}

void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
	percpu_counter_destroy(&fs_info->delalloc_bytes);
	percpu_counter_destroy(&fs_info->dio_bytes);
	percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
	btrfs_free_csum_hash(fs_info);
	btrfs_free_stripe_hash_table(fs_info);
	btrfs_free_ref_cache(fs_info);
	kfree(fs_info->balance_ctl);
	kfree(fs_info->delayed_root);
	btrfs_put_root(fs_info->extent_root);
	btrfs_put_root(fs_info->tree_root);
	btrfs_put_root(fs_info->chunk_root);
	btrfs_put_root(fs_info->dev_root);
	btrfs_put_root(fs_info->csum_root);
	btrfs_put_root(fs_info->quota_root);
	btrfs_put_root(fs_info->uuid_root);
	btrfs_put_root(fs_info->free_space_root);
	btrfs_put_root(fs_info->fs_root);
	btrfs_check_leaked_roots(fs_info);
	kfree(fs_info->super_copy);
	kfree(fs_info->super_for_commit);
	kvfree(fs_info);
}


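/*
 * Look up a root by its key.  Global roots are handed out straight from
 * fs_info, subvolume roots are taken from the radix tree cache or, on a
 * miss, read from disk, initialized and inserted into the cache.
 */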
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
				     struct btrfs_key *location,
				     bool check_ref)
{
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
		return btrfs_grab_root(fs_info->tree_root);
	if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
		return btrfs_grab_root(fs_info->extent_root);
	if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
		return btrfs_grab_root(fs_info->chunk_root);
	if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
		return btrfs_grab_root(fs_info->dev_root);
	if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
		return btrfs_grab_root(fs_info->csum_root);
	if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
		return btrfs_grab_root(fs_info->quota_root) ?
			fs_info->quota_root : ERR_PTR(-ENOENT);
	if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
		return btrfs_grab_root(fs_info->uuid_root) ?
			fs_info->uuid_root : ERR_PTR(-ENOENT);
	if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
		return btrfs_grab_root(fs_info->free_space_root) ?
			fs_info->free_space_root : ERR_PTR(-ENOENT);
again:
	root = btrfs_lookup_fs_root(fs_info, location->objectid);
	if (root) {
		if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
			btrfs_put_root(root);
			return ERR_PTR(-ENOENT);
		}
		return root;
	}

	root = btrfs_read_tree_root(fs_info->tree_root, location);
	if (IS_ERR(root))
		return root;

	if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
		ret = -ENOENT;
		goto fail;
	}

	ret = btrfs_init_fs_root(root);
	if (ret)
		goto fail;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto fail;
	}
	key.objectid = BTRFS_ORPHAN_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
	key.offset = location->objectid;

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
	btrfs_free_path(path);
	if (ret < 0)
		goto fail;
	if (ret == 0)
		set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);

	/*
	 * All roots have two refs on them at all times, one for the mounted fs,
	 * and one for being in the radix tree.  This way we only free the root
	 * when we are unmounting or deleting the subvolume.  We get one ref
	 * from __setup_root, one for inserting it into the radix tree, and then
	 * we have the third for returning it, and the caller will put it when
	 * it's done with the root.
	 */
	btrfs_grab_root(root);
	ret = btrfs_insert_fs_root(fs_info, root);
	if (ret) {
		btrfs_put_root(root);
		if (ret == -EEXIST) {
			btrfs_free_fs_root(root);
			goto again;
		}
		goto fail;
	}
	return root;
fail:
	btrfs_free_fs_root(root);
	return ERR_PTR(ret);
}

static int btrfs_congested_fn(void *congested_data, int bdi_bits)
{
	struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
	int ret = 0;
	struct btrfs_device *device;
	struct backing_dev_info *bdi;

	rcu_read_lock();
	list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
		if (!device->bdev)
			continue;
		bdi = device->bdev->bd_bdi;
		if (bdi_congested(bdi, bdi_bits)) {
			ret = 1;
			break;
		}
	}
	rcu_read_unlock();
	return ret;
}

1696 1697 1698 1699 1700
/*
 * called by the kthread helper functions to finally call the bio end_io
 * functions.  This is where read checksum verification actually happens
 */
static void end_workqueue_fn(struct btrfs_work *work)
1701 1702
{
	struct bio *bio;
1703
	struct btrfs_end_io_wq *end_io_wq;
1704

1705
	end_io_wq = container_of(work, struct btrfs_end_io_wq, work);
1706
	bio = end_io_wq->bio;
1707

1708
	bio->bi_status = end_io_wq->status;
1709 1710
	bio->bi_private = end_io_wq->private;
	bio->bi_end_io = end_io_wq->end_io;
1711
	bio_endio(bio);
1712
	kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
1713 1714
}
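/*
 * The cleaner kthread runs delayed iputs, removes one deleted snapshot per
 * pass, drives inode defrag and deletes unused block groups whenever the
 * filesystem is idle enough to allow it.
 */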

1715 1716 1717
static int cleaner_kthread(void *arg)
{
	struct btrfs_root *root = arg;
1718
	struct btrfs_fs_info *fs_info = root->fs_info;
1719
	int again;
1720

1721
	while (1) {
1722
		again = 0;
1723

1724 1725
		set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);

1726
		/* Make the cleaner go to sleep early. */
1727
		if (btrfs_need_cleaner_sleep(fs_info))
1728 1729
			goto sleep;

1730 1731 1732 1733
		/*
		 * Do not do anything if we might cause open_ctree() to block
		 * before we have finished mounting the filesystem.
		 */
1734
		if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1735 1736
			goto sleep;

1737
		if (!mutex_trylock(&fs_info->cleaner_mutex))
1738 1739
			goto sleep;

1740 1741 1742 1743
		/*
		 * Avoid the problem that we change the status of the fs
		 * during the above check and trylock.
		 */
1744
		if (btrfs_need_cleaner_sleep(fs_info)) {
1745
			mutex_unlock(&fs_info->cleaner_mutex);
1746
			goto sleep;
1747
		}
1748

1749
		btrfs_run_delayed_iputs(fs_info);
1750

1751
		again = btrfs_clean_one_deleted_snapshot(root);
1752
		mutex_unlock(&fs_info->cleaner_mutex);
1753 1754

		/*
1755 1756
		 * The defragger has dealt with the R/O remount and umount,
		 * needn't do anything special here.
1757
		 */
1758
		btrfs_run_defrag_inodes(fs_info);
1759 1760 1761 1762 1763 1764 1765 1766 1767

		/*
		 * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
		 * with relocation (btrfs_relocate_chunk) and relocation
		 * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
		 * after acquiring fs_info->delete_unused_bgs_mutex. So we
		 * can't hold, nor need to, fs_info->cleaner_mutex when deleting
		 * unused block groups.
		 */
1768
		btrfs_delete_unused_bgs(fs_info);
1769
sleep:
1770
		clear_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1771 1772 1773 1774
		if (kthread_should_park())
			kthread_parkme();
		if (kthread_should_stop())
			return 0;
1775
		if (!again) {
1776
			set_current_state(TASK_INTERRUPTIBLE);
1777
			schedule();
1778 1779
			__set_current_state(TASK_RUNNING);
		}
1780
	}
1781 1782 1783 1784 1785
}

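/*
 * Background thread that commits the running transaction once it is older
 * than fs_info->commit_interval, waking the cleaner on every iteration.
 */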
static int transaction_kthread(void *arg)
{
	struct btrfs_root *root = arg;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	struct btrfs_transaction *cur;
	u64 transid;
	time64_t now;
	unsigned long delay;
	bool cannot_commit;

	do {
		cannot_commit = false;
		delay = HZ * fs_info->commit_interval;
		mutex_lock(&fs_info->transaction_kthread_mutex);

		spin_lock(&fs_info->trans_lock);
		cur = fs_info->running_transaction;
		if (!cur) {
			spin_unlock(&fs_info->trans_lock);
			goto sleep;
		}

		now = ktime_get_seconds();
		if (cur->state < TRANS_STATE_COMMIT_START &&
		    !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) &&
		    (now < cur->start_time ||
		     now - cur->start_time < fs_info->commit_interval)) {
			spin_unlock(&fs_info->trans_lock);
			delay = HZ * 5;
			goto sleep;
		}
		transid = cur->transid;
		spin_unlock(&fs_info->trans_lock);

		/* If the file system is aborted, this will always fail. */
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) != -ENOENT)
				cannot_commit = true;
			goto sleep;
		}
		if (transid == trans->transid) {
			btrfs_commit_transaction(trans);
		} else {
			btrfs_end_transaction(trans);
		}
sleep:
		wake_up_process(fs_info->cleaner_kthread);
		mutex_unlock(&fs_info->transaction_kthread_mutex);

		if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
				      &fs_info->fs_state)))
			btrfs_cleanup_transaction(fs_info);
		if (!kthread_should_stop() &&
				(!btrfs_transaction_blocked(fs_info) ||
				 cannot_commit))
			schedule_timeout_interruptible(delay);
	} while (!kthread_should_stop());
	return 0;
}

/*
 * This will find the highest generation in the array of root backups.  The
 * index of the highest slot is returned, or -EINVAL if we can't find
 * anything.
 *
 * We check to make sure the array is valid by comparing the
 * generation of the latest root in the array with the generation
 * in the super block.  If they don't match we pitch it.
 */
static int find_newest_super_backup(struct btrfs_fs_info *info)
{
	const u64 newest_gen = btrfs_super_generation(info->super_copy);
	u64 cur;
	struct btrfs_root_backup *root_backup;
	int i;

	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
		root_backup = info->super_copy->super_roots + i;
		cur = btrfs_backup_tree_root_gen(root_backup);
		if (cur == newest_gen)
			return i;
	}

	return -EINVAL;
}

/*
 * copy all the root pointers into the super backup array.
 * this will bump the backup pointer by one when it is
 * done
 */
static void backup_super_roots(struct btrfs_fs_info *info)
{
	const int next_backup = info->backup_root_index;
	struct btrfs_root_backup *root_backup;

	root_backup = info->super_for_commit->super_roots + next_backup;

	/*
	 * make sure all of our padding and empty slots get zero filled
	 * regardless of which ones we use today
	 */
	memset(root_backup, 0, sizeof(*root_backup));

	info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;

	btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
	btrfs_set_backup_tree_root_gen(root_backup,
			       btrfs_header_generation(info->tree_root->node));

	btrfs_set_backup_tree_root_level(root_backup,
			       btrfs_header_level(info->tree_root->node));

	btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
	btrfs_set_backup_chunk_root_gen(root_backup,
			       btrfs_header_generation(info->chunk_root->node));
	btrfs_set_backup_chunk_root_level(root_backup,
			       btrfs_header_level(info->chunk_root->node));

	btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
	btrfs_set_backup_extent_root_gen(root_backup,
			       btrfs_header_generation(info->extent_root->node));
	btrfs_set_backup_extent_root_level(root_backup,
			       btrfs_header_level(info->extent_root->node));

	/*
	 * we might commit during log recovery, which happens before we set
	 * the fs_root.  Make sure it is valid before we fill it in.
	 */
	if (info->fs_root && info->fs_root->node) {
		btrfs_set_backup_fs_root(root_backup,
					 info->fs_root->node->start);
		btrfs_set_backup_fs_root_gen(root_backup,
			       btrfs_header_generation(info->fs_root->node));
		btrfs_set_backup_fs_root_level(root_backup,
			       btrfs_header_level(info->fs_root->node));
	}

	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
	btrfs_set_backup_dev_root_gen(root_backup,
			       btrfs_header_generation(info->dev_root->node));
	btrfs_set_backup_dev_root_level(root_backup,
				       btrfs_header_level(info->dev_root->node));

	btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
	btrfs_set_backup_csum_root_gen(root_backup,
			       btrfs_header_generation(info->csum_root->node));
	btrfs_set_backup_csum_root_level(root_backup,
			       btrfs_header_level(info->csum_root->node));

	btrfs_set_backup_total_bytes(root_backup,
			     btrfs_super_total_bytes(info->super_copy));
	btrfs_set_backup_bytes_used(root_backup,
			     btrfs_super_bytes_used(info->super_copy));
	btrfs_set_backup_num_devices(root_backup,
			     btrfs_super_num_devices(info->super_copy));

	/*
	 * if we don't copy this out to the super_copy, it won't get remembered
	 * for the next commit
	 */
	memcpy(&info->super_copy->super_roots,
	       &info->super_for_commit->super_roots,
	       sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
}

/*
 * read_backup_root - Reads a backup root based on the passed priority. Prio 0
 * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
 *
 * fs_info - filesystem whose backup roots need to be read
 * priority - priority of backup root required
 *
 * Returns backup root index on success and -EINVAL otherwise.
 */
static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
{
	int backup_index = find_newest_super_backup(fs_info);
	struct btrfs_super_block *super = fs_info->super_copy;
	struct btrfs_root_backup *root_backup;

	if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
		if (priority == 0)
			return backup_index;

		backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
		backup_index %= BTRFS_NUM_BACKUP_ROOTS;
	} else {
		return -EINVAL;
	}

	root_backup = super->super_roots + backup_index;

	btrfs_set_super_generation(super,
				   btrfs_backup_tree_root_gen(root_backup));
	btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
	btrfs_set_super_root_level(super,
				   btrfs_backup_tree_root_level(root_backup));
	btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));

	/*
	 * FIXME: the total bytes and num_devices need to match, otherwise we
	 * need a fsck
	 */
	btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
	btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));

	return backup_index;
}

/* helper to cleanup workers */
static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
	btrfs_destroy_workqueue(fs_info->fixup_workers);
	btrfs_destroy_workqueue(fs_info->delalloc_workers);
	btrfs_destroy_workqueue(fs_info->workers);
	btrfs_destroy_workqueue(fs_info->endio_workers);
	btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
	btrfs_destroy_workqueue(fs_info->endio_repair_workers);
	btrfs_destroy_workqueue(fs_info->rmw_workers);
	btrfs_destroy_workqueue(fs_info->endio_write_workers);
	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
	btrfs_destroy_workqueue(fs_info->delayed_workers);
	btrfs_destroy_workqueue(fs_info->caching_workers);
	btrfs_destroy_workqueue(fs_info->readahead_workers);
	btrfs_destroy_workqueue(fs_info->flush_workers);
	btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
	if (fs_info->discard_ctl.discard_workers)
		destroy_workqueue(fs_info->discard_ctl.discard_workers);
	/*
	 * Now that all other work queues are destroyed, we can safely destroy
	 * the queues used for metadata I/O, since tasks from those other work
	 * queues can do metadata I/O operations.
	 */
	btrfs_destroy_workqueue(fs_info->endio_meta_workers);
	btrfs_destroy_workqueue(fs_info->endio_meta_write_workers);
}

static void free_root_extent_buffers(struct btrfs_root *root)
{
	if (root) {
		free_extent_buffer(root->node);
		free_extent_buffer(root->commit_root);
		root->node = NULL;
		root->commit_root = NULL;
	}
}

/* helper to cleanup tree roots */
static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
{
	free_root_extent_buffers(info->tree_root);

	free_root_extent_buffers(info->dev_root);
	free_root_extent_buffers(info->extent_root);
	free_root_extent_buffers(info->csum_root);
	free_root_extent_buffers(info->quota_root);
	free_root_extent_buffers(info->uuid_root);
	if (free_chunk_root)
		free_root_extent_buffers(info->chunk_root);
	free_root_extent_buffers(info->free_space_root);
}

void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
	int ret;
	struct btrfs_root *gang[8];
	int i;

	while (!list_empty(&fs_info->dead_roots)) {
		gang[0] = list_entry(fs_info->dead_roots.next,
				     struct btrfs_root, root_list);
		list_del(&gang[0]->root_list);

		if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) {
			btrfs_drop_and_free_fs_root(fs_info, gang[0]);
		} else {
			free_extent_buffer(gang[0]->node);
			free_extent_buffer(gang[0]->commit_root);
			btrfs_put_root(gang[0]);
		}
	}

	while (1) {
		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
					     (void **)gang, 0,
					     ARRAY_SIZE(gang));
		if (!ret)
			break;
		for (i = 0; i < ret; i++)
			btrfs_drop_and_free_fs_root(fs_info, gang[i]);
	}

	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
		btrfs_free_log_root_tree(NULL, fs_info);
		btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
	}
}

static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
{
	mutex_init(&fs_info->scrub_lock);
	atomic_set(&fs_info->scrubs_running, 0);
	atomic_set(&fs_info->scrub_pause_req, 0);
	atomic_set(&fs_info->scrubs_paused, 0);
	atomic_set(&fs_info->scrub_cancel_req, 0);
	init_waitqueue_head(&fs_info->scrub_pause_wait);
	refcount_set(&fs_info->scrub_workers_refcnt, 0);
}

static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
{
	spin_lock_init(&fs_info->balance_lock);
	mutex_init(&fs_info->balance_mutex);
	atomic_set(&fs_info->balance_pause_req, 0);
	atomic_set(&fs_info->balance_cancel_req, 0);
	fs_info->balance_ctl = NULL;
	init_waitqueue_head(&fs_info->balance_wait_q);
}

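/*
 * Set up the dummy btree inode backing the metadata address space.  i_size
 * is set to the largest possible offset since the real end of the address
 * space is determined by the devices in the system.
 */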
static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
{
	struct inode *inode = fs_info->btree_inode;

	inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
	set_nlink(inode, 1);
	/*
	 * we set the i_size on the btree inode to the max possible int.
	 * the real end of the address space is determined by all of
	 * the devices in the system
	 */
	inode->i_size = OFFSET_MAX;
	inode->i_mapping->a_ops = &btree_aops;

	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
	extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
			    IO_TREE_INODE_IO, inode);
	BTRFS_I(inode)->io_tree.track_uptodate = false;
	extent_map_tree_init(&BTRFS_I(inode)->extent_tree);

	BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;

	BTRFS_I(inode)->root = fs_info->tree_root;
	memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
	btrfs_insert_inode_hash(inode);
}

static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
{
	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
	init_rwsem(&fs_info->dev_replace.rwsem);
	init_waitqueue_head(&fs_info->dev_replace.replace_wait);
}

static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
{
	spin_lock_init(&fs_info->qgroup_lock);
	mutex_init(&fs_info->qgroup_ioctl_lock);
	fs_info->qgroup_tree = RB_ROOT;
	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
	fs_info->qgroup_seq = 1;
	fs_info->qgroup_ulist = NULL;
	fs_info->qgroup_rescan_running = false;
	mutex_init(&fs_info->qgroup_rescan_lock);
}

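/*
 * Allocate the work queues used for asynchronous work (generic workers,
 * delalloc, the endio variants, caching, readahead, qgroup rescan and
 * discard).  Returns -ENOMEM if any of the allocations fails.
 */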
static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
		struct btrfs_fs_devices *fs_devices)
{
	u32 max_active = fs_info->thread_pool_size;
	unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;

	fs_info->workers =
		btrfs_alloc_workqueue(fs_info, "worker",
				      flags | WQ_HIGHPRI, max_active, 16);

	fs_info->delalloc_workers =
		btrfs_alloc_workqueue(fs_info, "delalloc",
				      flags, max_active, 2);

	fs_info->flush_workers =
		btrfs_alloc_workqueue(fs_info, "flush_delalloc",
				      flags, max_active, 0);

	fs_info->caching_workers =
		btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);

	fs_info->fixup_workers =
		btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);

	/*
	 * endios are largely parallel and should have a very
	 * low idle thresh
	 */
	fs_info->endio_workers =
		btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
	fs_info->endio_meta_workers =
		btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
				      max_active, 4);
	fs_info->endio_meta_write_workers =
		btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
				      max_active, 2);
	fs_info->endio_raid56_workers =
		btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
				      max_active, 4);
	fs_info->endio_repair_workers =
		btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0);
	fs_info->rmw_workers =
		btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
	fs_info->endio_write_workers =
		btrfs_alloc_workqueue(fs_info, "endio-write", flags,
				      max_active, 2);
	fs_info->endio_freespace_worker =
		btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
				      max_active, 0);
	fs_info->delayed_workers =
		btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
				      max_active, 0);
	fs_info->readahead_workers =
		btrfs_alloc_workqueue(fs_info, "readahead", flags,
				      max_active, 2);
	fs_info->qgroup_rescan_workers =
		btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
	fs_info->discard_ctl.discard_workers =
		alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);

	if (!(fs_info->workers && fs_info->delalloc_workers &&
	      fs_info->flush_workers &&
	      fs_info->endio_workers && fs_info->endio_meta_workers &&
	      fs_info->endio_meta_write_workers &&
	      fs_info->endio_repair_workers &&
	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
	      fs_info->caching_workers && fs_info->readahead_workers &&
	      fs_info->fixup_workers && fs_info->delayed_workers &&
	      fs_info->qgroup_rescan_workers &&
	      fs_info->discard_ctl.discard_workers)) {
		return -ENOMEM;
	}

	return 0;
}

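/*
 * Allocate the crypto shash matching the checksum type recorded in the
 * super block and cache it in fs_info->csum_shash.
 */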
static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
{
	struct crypto_shash *csum_shash;
	const char *csum_driver = btrfs_super_csum_driver(csum_type);

	csum_shash = crypto_alloc_shash(csum_driver, 0, 0);

	if (IS_ERR(csum_shash)) {
		btrfs_err(fs_info, "error allocating %s hash for checksum",
			  csum_driver);
		return PTR_ERR(csum_shash);
	}

	fs_info->csum_shash = csum_shash;

	return 0;
}

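/*
 * Read the log tree root recorded in the super block and replay it.  Replay
 * has to write to the devices, so it is refused when no writable device is
 * available; on a read-only mount the result is committed right away.
 */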
static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
			    struct btrfs_fs_devices *fs_devices)
{
	int ret;
	struct btrfs_root *log_tree_root;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	u64 bytenr = btrfs_super_log_root(disk_super);
	int level = btrfs_super_log_root_level(disk_super);

	if (fs_devices->rw_devices == 0) {
		btrfs_warn(fs_info, "log replay required on RO media");
		return -EIO;
	}

	log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
					 GFP_KERNEL);
	if (!log_tree_root)
		return -ENOMEM;

	log_tree_root->node = read_tree_block(fs_info, bytenr,
					      fs_info->generation + 1,
					      level, NULL);
	if (IS_ERR(log_tree_root->node)) {
		btrfs_warn(fs_info, "failed to read log tree");
		ret = PTR_ERR(log_tree_root->node);
		btrfs_put_root(log_tree_root);
		return ret;
	} else if (!extent_buffer_uptodate(log_tree_root->node)) {
		btrfs_err(fs_info, "failed to read log tree");
		free_extent_buffer(log_tree_root->node);
		btrfs_put_root(log_tree_root);
		return -EIO;
	}
	/* returns with log_tree_root freed on success */
	ret = btrfs_recover_log_trees(log_tree_root);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to recover log tree");
		free_extent_buffer(log_tree_root->node);
		btrfs_put_root(log_tree_root);
		return ret;
	}

	if (sb_rdonly(fs_info->sb)) {
		ret = btrfs_commit_super(fs_info);
		if (ret)
			return ret;
	}

	return 0;
}

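/*
 * Read the mandatory tree roots (extent, device and csum trees) from the
 * tree root, plus the optional quota, UUID and free space tree roots.
 */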
static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root;
	struct btrfs_key location;
	int ret;

	BUG_ON(!fs_info->tree_root);

	location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
	location.type = BTRFS_ROOT_ITEM_KEY;
	location.offset = 0;

	root = btrfs_read_tree_root(tree_root, &location);
	if (IS_ERR(root)) {
		ret = PTR_ERR(root);
		goto out;
	}
	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
	fs_info->extent_root = root;

	location.objectid = BTRFS_DEV_TREE_OBJECTID;
	root = btrfs_read_tree_root(tree_root, &location);
	if (IS_ERR(root)) {
		ret = PTR_ERR(root);
		goto out;
	}
	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
	fs_info->dev_root = root;
	btrfs_init_devices_late(fs_info);

	location.objectid = BTRFS_CSUM_TREE_OBJECTID;
	root = btrfs_read_tree_root(tree_root, &location);
	if (IS_ERR(root)) {
		ret = PTR_ERR(root);
		goto out;
	}
	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
	fs_info->csum_root = root;

	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
	root = btrfs_read_tree_root(tree_root, &location);
	if (!IS_ERR(root)) {
		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
		set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
		fs_info->quota_root = root;
	}

	location.objectid = BTRFS_UUID_TREE_OBJECTID;
	root = btrfs_read_tree_root(tree_root, &location);
	if (IS_ERR(root)) {
		ret = PTR_ERR(root);
		if (ret != -ENOENT)
			goto out;
	} else {
		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
		fs_info->uuid_root = root;
	}

	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
		location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
		root = btrfs_read_tree_root(tree_root, &location);
		if (IS_ERR(root)) {
			ret = PTR_ERR(root);
			goto out;
		}
		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
		fs_info->free_space_root = root;
	}

	return 0;
out:
	btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
		   location.objectid, ret);
	return ret;
}

/*
 * Real super block validation
 * NOTE: super csum type and incompat features will not be checked here.
 *
 * @sb:		super block to check
 * @mirror_num:	the super block number to check its bytenr:
 * 		0	the primary (1st) sb
 * 		1, 2	2nd and 3rd backup copy
 * 	       -1	skip bytenr check
 */
static int validate_super(struct btrfs_fs_info *fs_info,
			    struct btrfs_super_block *sb, int mirror_num)
{
	u64 nodesize = btrfs_super_nodesize(sb);
	u64 sectorsize = btrfs_super_sectorsize(sb);
	int ret = 0;

	if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
		btrfs_err(fs_info, "no valid FS found");
		ret = -EINVAL;
	}
	if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
		btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
				btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
		ret = -EINVAL;
	}
	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info, "tree_root level too big: %d >= %d",
				btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
		ret = -EINVAL;
	}
	if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
				btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
		ret = -EINVAL;
	}
	if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info, "log_root level too big: %d >= %d",
				btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
		ret = -EINVAL;
	}

	/*
	 * Check sectorsize and nodesize first, other check will need it.
	 * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here.
	 */
	if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
	    sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
		btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
		ret = -EINVAL;
	}
	/* Only PAGE SIZE is supported yet */
	if (sectorsize != PAGE_SIZE) {
		btrfs_err(fs_info,
			"sectorsize %llu not supported yet, only support %lu",
			sectorsize, PAGE_SIZE);
		ret = -EINVAL;
	}
	if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
	    nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
		btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
		ret = -EINVAL;
	}
	if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
		btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
			  le32_to_cpu(sb->__unused_leafsize), nodesize);
		ret = -EINVAL;
	}

	/* Root alignment check */
	if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
		btrfs_warn(fs_info, "tree_root block unaligned: %llu",
			   btrfs_super_root(sb));
		ret = -EINVAL;
	}
	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
		btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
			   btrfs_super_chunk_root(sb));
		ret = -EINVAL;
	}
	if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
		btrfs_warn(fs_info, "log_root block unaligned: %llu",
			   btrfs_super_log_root(sb));
		ret = -EINVAL;
	}

	if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
		   BTRFS_FSID_SIZE) != 0) {
		btrfs_err(fs_info,
			"dev_item UUID does not match metadata fsid: %pU != %pU",
			fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
		ret = -EINVAL;
	}

	/*
	 * Hint to catch really bogus numbers, bitflips or so, more exact checks are
	 * done later
	 */
	if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
		btrfs_err(fs_info, "bytes_used is too small %llu",
			  btrfs_super_bytes_used(sb));
		ret = -EINVAL;
	}
	if (!is_power_of_2(btrfs_super_stripesize(sb))) {
		btrfs_err(fs_info, "invalid stripesize %u",
			  btrfs_super_stripesize(sb));
		ret = -EINVAL;
	}
	if (btrfs_super_num_devices(sb) > (1UL << 31))
		btrfs_warn(fs_info, "suspicious number of devices: %llu",
			   btrfs_super_num_devices(sb));
	if (btrfs_super_num_devices(sb) == 0) {
		btrfs_err(fs_info, "number of devices is 0");
		ret = -EINVAL;
	}

	if (mirror_num >= 0 &&
	    btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
		btrfs_err(fs_info, "super offset mismatch %llu != %u",
			  btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
		ret = -EINVAL;
	}

	/*
	 * Obvious sys_chunk_array corruptions, it must hold at least one key
	 * and one chunk
	 */
	if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
		btrfs_err(fs_info, "system chunk array too big %u > %u",
			  btrfs_super_sys_array_size(sb),
			  BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
		ret = -EINVAL;
	}
	if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
			+ sizeof(struct btrfs_chunk)) {
		btrfs_err(fs_info, "system chunk array too small %u < %zu",
			  btrfs_super_sys_array_size(sb),
			  sizeof(struct btrfs_disk_key)
			  + sizeof(struct btrfs_chunk));
		ret = -EINVAL;
	}

	/*
	 * The generation is a global counter, we'll trust it more than the others
	 * but it's still possible that it's the one that's wrong.
	 */
	if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
		btrfs_warn(fs_info,
			"suspicious: generation < chunk_root_generation: %llu < %llu",
			btrfs_super_generation(sb),
			btrfs_super_chunk_root_generation(sb));
	if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
	    && btrfs_super_cache_generation(sb) != (u64)-1)
		btrfs_warn(fs_info,
			"suspicious: generation < cache_generation: %llu < %llu",
			btrfs_super_generation(sb),
			btrfs_super_cache_generation(sb));

	return ret;
}

/*
 * Validation of super block at mount time.
 * Some checks already done early at mount time, like csum type and incompat
 * flags will be skipped.
 */
static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
{
	return validate_super(fs_info, fs_info->super_copy, 0);
}

/*
 * Validation of super block at write time.
 * Some checks like bytenr check will be skipped as their values will be
 * overwritten soon.
 * Extra checks like csum type and incompat flags will be done here.
 */
static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
				      struct btrfs_super_block *sb)
{
	int ret;

	ret = validate_super(fs_info, sb, -1);
	if (ret < 0)
		goto out;
	if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
		ret = -EUCLEAN;
		btrfs_err(fs_info, "invalid csum type, has %u want %u",
			  btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
		goto out;
	}
	if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
		ret = -EUCLEAN;
		btrfs_err(fs_info,
		"invalid incompat flags, has 0x%llx valid mask 0x%llx",
			  btrfs_super_incompat_flags(sb),
			  (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
		goto out;
	}
out:
	if (ret < 0)
		btrfs_err(fs_info,
		"super block corruption detected before writing it to disk");
	return ret;
}

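/*
 * Read the tree root pointed to by the super block.  On failure, and only
 * if the usebackuproot mount option is set, retry with the backup roots
 * from newest to oldest until one of them yields a usable tree root.
 */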
static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
{
	int backup_index = find_newest_super_backup(fs_info);
	struct btrfs_super_block *sb = fs_info->super_copy;
	struct btrfs_root *tree_root = fs_info->tree_root;
	bool handle_error = false;
	int ret = 0;
	int i;

	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
		u64 generation;
		int level;

		if (handle_error) {
			if (!IS_ERR(tree_root->node))
				free_extent_buffer(tree_root->node);
			tree_root->node = NULL;

			if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
				break;

			free_root_pointers(fs_info, 0);

			/*
			 * Don't use the log in recovery mode, it won't be
			 * valid
			 */
			btrfs_set_super_log_root(sb, 0);

			/* We can't trust the free space cache either */
			btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);

			ret = read_backup_root(fs_info, i);
			backup_index = ret;
			if (ret < 0)
				return ret;
		}
		generation = btrfs_super_generation(sb);
		level = btrfs_super_root_level(sb);
		tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb),
						  generation, level, NULL);
		if (IS_ERR(tree_root->node) ||
		    !extent_buffer_uptodate(tree_root->node)) {
			handle_error = true;

			if (IS_ERR(tree_root->node))
				ret = PTR_ERR(tree_root->node);
			else if (!extent_buffer_uptodate(tree_root->node))
				ret = -EUCLEAN;

			btrfs_warn(fs_info, "failed to read tree root");
			continue;
		}

		btrfs_set_root_node(&tree_root->root_item, tree_root->node);
		tree_root->commit_root = btrfs_root_node(tree_root);
		btrfs_set_root_refs(&tree_root->root_item, 1);

		/*
		 * No need to hold btrfs_root::objectid_mutex since the fs
		 * hasn't been fully initialised and we are the only user
		 */
		ret = btrfs_find_highest_objectid(tree_root,
						&tree_root->highest_objectid);
		if (ret < 0) {
			handle_error = true;
			continue;
		}

		ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);

		ret = btrfs_read_roots(fs_info);
		if (ret < 0) {
			handle_error = true;
			continue;
		}

		/* All successful */
		fs_info->generation = generation;
		fs_info->last_trans_committed = generation;

		/* Always begin writing backup roots after the one being used */
		if (backup_index < 0) {
			fs_info->backup_root_index = 0;
		} else {
			fs_info->backup_root_index = backup_index + 1;
			fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
		}
		break;
	}

	return ret;
}

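/*
 * One-time initialization of the in-memory fs_info structure: locks, lists,
 * trees, reservations and defaults that do not depend on the on-disk super
 * block.
 */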
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
	INIT_LIST_HEAD(&fs_info->trans_list);
	INIT_LIST_HEAD(&fs_info->dead_roots);
	INIT_LIST_HEAD(&fs_info->delayed_iputs);
	INIT_LIST_HEAD(&fs_info->delalloc_roots);
	INIT_LIST_HEAD(&fs_info->caching_block_groups);
	spin_lock_init(&fs_info->delalloc_root_lock);
	spin_lock_init(&fs_info->trans_lock);
	spin_lock_init(&fs_info->fs_roots_radix_lock);
	spin_lock_init(&fs_info->delayed_iput_lock);
	spin_lock_init(&fs_info->defrag_inodes_lock);
	spin_lock_init(&fs_info->super_lock);
	spin_lock_init(&fs_info->buffer_lock);
	spin_lock_init(&fs_info->unused_bgs_lock);
	rwlock_init(&fs_info->tree_mod_log_lock);
	mutex_init(&fs_info->unused_bg_unpin_mutex);
	mutex_init(&fs_info->delete_unused_bgs_mutex);
	mutex_init(&fs_info->reloc_mutex);
	mutex_init(&fs_info->delalloc_root_mutex);
	seqlock_init(&fs_info->profiles_lock);

	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
	INIT_LIST_HEAD(&fs_info->space_info);
	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
	INIT_LIST_HEAD(&fs_info->unused_bgs);
#ifdef CONFIG_BTRFS_DEBUG
	INIT_LIST_HEAD(&fs_info->allocated_roots);
#endif
	extent_map_tree_init(&fs_info->mapping_tree);
	btrfs_init_block_rsv(&fs_info->global_block_rsv,
			     BTRFS_BLOCK_RSV_GLOBAL);
	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
			     BTRFS_BLOCK_RSV_DELOPS);
	btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
			     BTRFS_BLOCK_RSV_DELREFS);

	atomic_set(&fs_info->async_delalloc_pages, 0);
	atomic_set(&fs_info->defrag_running, 0);
	atomic_set(&fs_info->reada_works_cnt, 0);
	atomic_set(&fs_info->nr_delayed_iputs, 0);
	atomic64_set(&fs_info->tree_mod_seq, 0);
	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
	fs_info->metadata_ratio = 0;
	fs_info->defrag_inodes = RB_ROOT;
	atomic64_set(&fs_info->free_chunk_space, 0);
	fs_info->tree_mod_log = RB_ROOT;
	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
	fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
	/* readahead state */
	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
	spin_lock_init(&fs_info->reada_lock);
	btrfs_init_ref_verify(fs_info);

	fs_info->thread_pool_size = min_t(unsigned long,
					  num_online_cpus() + 2, 8);

	INIT_LIST_HEAD(&fs_info->ordered_roots);
	spin_lock_init(&fs_info->ordered_root_lock);

	btrfs_init_scrub(fs_info);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	fs_info->check_integrity_print_mask = 0;
#endif
	btrfs_init_balance(fs_info);
	btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work);

	spin_lock_init(&fs_info->block_group_cache_lock);
	fs_info->block_group_cache_tree = RB_ROOT;
	fs_info->first_logical_byte = (u64)-1;

	extent_io_tree_init(fs_info, &fs_info->freed_extents[0],
			    IO_TREE_FS_INFO_FREED_EXTENTS0, NULL);
	extent_io_tree_init(fs_info, &fs_info->freed_extents[1],
			    IO_TREE_FS_INFO_FREED_EXTENTS1, NULL);
	fs_info->pinned_extents = &fs_info->freed_extents[0];
	set_bit(BTRFS_FS_BARRIER, &fs_info->flags);

	mutex_init(&fs_info->ordered_operations_mutex);
	mutex_init(&fs_info->tree_log_mutex);
	mutex_init(&fs_info->chunk_mutex);
	mutex_init(&fs_info->transaction_kthread_mutex);
	mutex_init(&fs_info->cleaner_mutex);
	mutex_init(&fs_info->ro_block_group_mutex);
	init_rwsem(&fs_info->commit_root_sem);
	init_rwsem(&fs_info->cleanup_work_sem);
	init_rwsem(&fs_info->subvol_sem);
	sema_init(&fs_info->uuid_tree_rescan_sem, 1);

	btrfs_init_dev_replace_locks(fs_info);
	btrfs_init_qgroup(fs_info);
	btrfs_discard_init(fs_info);

	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);

	init_waitqueue_head(&fs_info->transaction_throttle);
	init_waitqueue_head(&fs_info->transaction_wait);
	init_waitqueue_head(&fs_info->transaction_blocked_wait);
	init_waitqueue_head(&fs_info->async_submit_wait);
	init_waitqueue_head(&fs_info->delayed_iputs_wait);

	/* Usable values until the real ones are cached from the superblock */
	fs_info->nodesize = 4096;
	fs_info->sectorsize = 4096;
	fs_info->stripesize = 4096;

	spin_lock_init(&fs_info->swapfile_pins_lock);
	fs_info->swapfile_pins = RB_ROOT;

	fs_info->send_in_progress = 0;
}

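/*
 * Per-mount initialization that can fail: srcu, the percpu counters, the
 * delayed root and the raid56 stripe hash table.
 */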
static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
{
	int ret;

	fs_info->sb = sb;
	sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
	sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);

	ret = init_srcu_struct(&fs_info->subvol_srcu);
	if (ret)
		return ret;

	ret = percpu_counter_init(&fs_info->dio_bytes, 0, GFP_KERNEL);
	if (ret)
		goto fail;

	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
	if (ret)
		goto fail;

	fs_info->dirty_metadata_batch = PAGE_SIZE *
					(1 + ilog2(nr_cpu_ids));

	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
	if (ret)
		goto fail;

	ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
			GFP_KERNEL);
	if (ret)
		goto fail;

	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
					GFP_KERNEL);
	if (!fs_info->delayed_root) {
		ret = -ENOMEM;
		goto fail;
	}
	btrfs_init_delayed_root(fs_info->delayed_root);

	ret = btrfs_alloc_stripe_hash_table(fs_info);
	if (ret)
		goto fail;

	return 0;
fail:
	cleanup_srcu_struct(&fs_info->subvol_srcu);
	return ret;
}

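/*
 * Open a filesystem: read and validate the super block, start the worker
 * threads, read the chunk tree and the other tree roots, replay the log if
 * needed and start the cleaner and transaction kthreads.
 */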
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
		      char *options)
{
	u32 sectorsize;
	u32 nodesize;
	u32 stripesize;
	u64 generation;
	u64 features;
	u16 csum_type;
	struct btrfs_key location;
	struct buffer_head *bh;
	struct btrfs_super_block *disk_super;
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
	struct btrfs_root *tree_root;
	struct btrfs_root *chunk_root;
	int ret;
	int err = -EINVAL;
	int clear_free_space_tree = 0;
	int level;

	ret = init_mount_fs_info(fs_info, sb);
	if (ret) {
		err = ret;
		goto fail;
	}

	/* These need to be init'ed before we start creating inodes and such. */
	tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
				     GFP_KERNEL);
	fs_info->tree_root = tree_root;
	chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
				      GFP_KERNEL);
	fs_info->chunk_root = chunk_root;
	if (!tree_root || !chunk_root) {
		err = -ENOMEM;
		goto fail_srcu;
	}

	fs_info->btree_inode = new_inode(sb);
	if (!fs_info->btree_inode) {
		err = -ENOMEM;
		goto fail_srcu;
	}
	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
	btrfs_init_btree_inode(fs_info);

	invalidate_bdev(fs_devices->latest_bdev);

	/*
	 * Read super block and check the signature bytes only
	 */
	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
	if (IS_ERR(bh)) {
		err = PTR_ERR(bh);
		goto fail_alloc;
	}

	/*
	 * Verify the type first, if that or the checksum value are
	 * corrupted, we'll find out
	 */
	csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data);
	if (!btrfs_supported_super_csum(csum_type)) {
		btrfs_err(fs_info, "unsupported checksum algorithm: %u",
			  csum_type);
		err = -EINVAL;
		brelse(bh);
		goto fail_alloc;
	}

	ret = btrfs_init_csum_hash(fs_info, csum_type);
	if (ret) {
		err = ret;
		goto fail_alloc;
	}

	/*
	 * We want to check superblock checksum, the type is stored inside.
	 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
	 */
	if (btrfs_check_super_csum(fs_info, bh->b_data)) {
		btrfs_err(fs_info, "superblock checksum mismatch");
		err = -EINVAL;
		brelse(bh);
		goto fail_alloc;
	}

	/*
	 * super_copy is zeroed at allocation time and we never touch the
	 * following bytes up to INFO_SIZE, the checksum is calculated from
	 * the whole block of INFO_SIZE
	 */
	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
	brelse(bh);

	disk_super = fs_info->super_copy;

	ASSERT(!memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
		       BTRFS_FSID_SIZE));

	if (btrfs_fs_incompat(fs_info, METADATA_UUID)) {
		ASSERT(!memcmp(fs_info->fs_devices->metadata_uuid,
				fs_info->super_copy->metadata_uuid,
				BTRFS_FSID_SIZE));
	}

	features = btrfs_super_flags(disk_super);
	if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
		features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2;
		btrfs_set_super_flags(disk_super, features);
		btrfs_info(fs_info,
			"found metadata UUID change in progress flag, clearing");
	}

	memcpy(fs_info->super_for_commit, fs_info->super_copy,
	       sizeof(*fs_info->super_for_commit));

	ret = btrfs_validate_mount_super(fs_info);
	if (ret) {
		btrfs_err(fs_info, "superblock contains fatal errors");
		err = -EINVAL;
		goto fail_alloc;
	}

	if (!btrfs_super_root(disk_super))
		goto fail_alloc;

	/* check FS state, whether FS is broken. */
	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
		set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);

	/*
	 * In the long term, we'll store the compression type in the super
	 * block, and it'll be used for per file compression control.
	 */
	fs_info->compress_type = BTRFS_COMPRESS_ZLIB;

	ret = btrfs_parse_options(fs_info, options, sb->s_flags);
	if (ret) {
		err = ret;
		goto fail_alloc;
	}

	features = btrfs_super_incompat_flags(disk_super) &
		~BTRFS_FEATURE_INCOMPAT_SUPP;
	if (features) {
		btrfs_err(fs_info,
		    "cannot mount because of unsupported optional features (%llx)",
		    features);
		err = -EINVAL;
		goto fail_alloc;
	}

	features = btrfs_super_incompat_flags(disk_super);
	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
	if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
	else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;

	if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
		btrfs_info(fs_info, "has skinny extents");

	/*
	 * flag our filesystem as having big metadata blocks if
	 * they are bigger than the page size
	 */
	if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
		if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
			btrfs_info(fs_info,
				"flagging fs with big metadata feature");
		features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
	}

	nodesize = btrfs_super_nodesize(disk_super);
	sectorsize = btrfs_super_sectorsize(disk_super);
	stripesize = sectorsize;
	fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));

	/* Cache block sizes */
	fs_info->nodesize = nodesize;
	fs_info->sectorsize = sectorsize;
	fs_info->stripesize = stripesize;

	/*
	 * mixed block groups end up with duplicate but slightly offset
	 * extent buffers for the same range.  It leads to corruptions
	 */
	if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
	    (sectorsize != nodesize)) {
		btrfs_err(fs_info,
"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
			nodesize, sectorsize);
		goto fail_alloc;
	}

	/*
	 * Needn't use the lock because there is no other task which will
	 * update the flag.
	 */
	btrfs_set_super_incompat_flags(disk_super, features);

	features = btrfs_super_compat_ro_flags(disk_super) &
		~BTRFS_FEATURE_COMPAT_RO_SUPP;
	if (!sb_rdonly(sb) && features) {
		btrfs_err(fs_info,
	"cannot mount read-write because of unsupported optional features (%llx)",
		       features);
		err = -EINVAL;
		goto fail_alloc;
	}

	ret = btrfs_init_workqueues(fs_info, fs_devices);
	if (ret) {
		err = ret;
		goto fail_sb_buffer;
	}

	sb->s_bdi->congested_fn = btrfs_congested_fn;
	sb->s_bdi->congested_data = fs_info;
	sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;
	sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
	sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
	sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);

	sb->s_blocksize = sectorsize;
	sb->s_blocksize_bits = blksize_bits(sectorsize);
	memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);

	mutex_lock(&fs_info->chunk_mutex);
	ret = btrfs_read_sys_array(fs_info);
	mutex_unlock(&fs_info->chunk_mutex);
	if (ret) {
		btrfs_err(fs_info, "failed to read the system array: %d", ret);
		goto fail_sb_buffer;
	}

	generation = btrfs_super_chunk_root_generation(disk_super);
	level = btrfs_super_chunk_root_level(disk_super);

	chunk_root->node = read_tree_block(fs_info,
					   btrfs_super_chunk_root(disk_super),
					   generation, level, NULL);
	if (IS_ERR(chunk_root->node) ||
	    !extent_buffer_uptodate(chunk_root->node)) {
		btrfs_err(fs_info, "failed to read chunk root");
		if (!IS_ERR(chunk_root->node))
			free_extent_buffer(chunk_root->node);
		chunk_root->node = NULL;
		goto fail_tree_roots;
	}
	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
	chunk_root->commit_root = btrfs_root_node(chunk_root);

	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
	   btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE);

	ret = btrfs_read_chunk_tree(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
		goto fail_tree_roots;
	}

	/*
	 * Keep the devid that is marked to be the target device for the
	 * device replace procedure
	 */
	btrfs_free_extra_devids(fs_devices, 0);

	if (!fs_devices->latest_bdev) {
		btrfs_err(fs_info, "failed to read devices");
		goto fail_tree_roots;
	}

	ret = init_tree_roots(fs_info);
	if (ret)
		goto fail_tree_roots;

	ret = btrfs_verify_dev_extents(fs_info);
	if (ret) {
		btrfs_err(fs_info,
			  "failed to verify dev extents against chunks: %d",
			  ret);
		goto fail_block_groups;
	}
	ret = btrfs_recover_balance(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to recover balance: %d", ret);
		goto fail_block_groups;
	}

	ret = btrfs_init_dev_stats(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
		goto fail_block_groups;
	}

	ret = btrfs_init_dev_replace(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
		goto fail_block_groups;
	}

	btrfs_free_extra_devids(fs_devices, 1);

	ret = btrfs_sysfs_add_fsid(fs_devices);
	if (ret) {
		btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
				ret);
		goto fail_block_groups;
	}

	ret = btrfs_sysfs_add_mounted(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
		goto fail_fsdev_sysfs;
	}

	ret = btrfs_init_space_info(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to initialize space info: %d", ret);
		goto fail_sysfs;
	}

	ret = btrfs_read_block_groups(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to read block groups: %d", ret);
		goto fail_sysfs;
	}

	if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
		btrfs_warn(fs_info,
		"writable mount is not allowed due to too many missing devices");
		goto fail_sysfs;
	}

	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
					       "btrfs-cleaner");
	if (IS_ERR(fs_info->cleaner_kthread))
		goto fail_sysfs;

	fs_info->transaction_kthread = kthread_run(transaction_kthread,
						   tree_root,
						   "btrfs-transaction");
	if (IS_ERR(fs_info->transaction_kthread))
		goto fail_cleaner;

	if (!btrfs_test_opt(fs_info, NOSSD) &&
	    !fs_info->fs_devices->rotating) {
		btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
	}

	/*
	 * Mount does not set all options immediately, we can do it now and do
	 * not have to wait for transaction commit
	 */
	btrfs_apply_pending_changes(fs_info);

#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
		ret = btrfsic_mount(fs_info, fs_devices,
				    btrfs_test_opt(fs_info,
					CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
				    1 : 0,
				    fs_info->check_integrity_print_mask);
		if (ret)
			btrfs_warn(fs_info,
				"failed to initialize integrity check module: %d",
				ret);
	}
#endif
	ret = btrfs_read_qgroup_config(fs_info);
	if (ret)
		goto fail_trans_kthread;

	if (btrfs_build_ref_tree(fs_info))
		btrfs_err(fs_info, "couldn't build ref tree");

	/* do not make disk changes in broken FS or nologreplay is given */
	if (btrfs_super_log_root(disk_super) != 0 &&
	    !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
		btrfs_info(fs_info, "start tree-log replay");
		ret = btrfs_replay_log(fs_info, fs_devices);
		if (ret) {
			err = ret;
			goto fail_qgroup;
		}
	}

	ret = btrfs_find_orphan_roots(fs_info);
	if (ret)
		goto fail_qgroup;

	if (!sb_rdonly(sb)) {
		ret = btrfs_cleanup_fs_roots(fs_info);
		if (ret)
			goto fail_qgroup;

		mutex_lock(&fs_info->cleaner_mutex);
		ret = btrfs_recover_relocation(tree_root);
		mutex_unlock(&fs_info->cleaner_mutex);
		if (ret < 0) {
			btrfs_warn(fs_info, "failed to recover relocation: %d",
					ret);
			err = -EINVAL;
			goto fail_qgroup;
		}
	}

	location.objectid = BTRFS_FS_TREE_OBJECTID;
	location.type = BTRFS_ROOT_ITEM_KEY;
	location.offset = 0;

	fs_info->fs_root = btrfs_get_fs_root(fs_info, &location, true);
	if (IS_ERR(fs_info->fs_root)) {
		err = PTR_ERR(fs_info->fs_root);
		btrfs_warn(fs_info, "failed to read fs tree: %d", err);
		fs_info->fs_root = NULL;
		goto fail_qgroup;
	}

	if (sb_rdonly(sb))
		return 0;

	if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
	    btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
		clear_free_space_tree = 1;
	} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
		   !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
		btrfs_warn(fs_info, "free space tree is invalid");
		clear_free_space_tree = 1;
	}

	if (clear_free_space_tree) {
		btrfs_info(fs_info, "clearing free space tree");
		ret = btrfs_clear_free_space_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				   "failed to clear free space tree: %d", ret);
			close_ctree(fs_info);
			return ret;
		}
	}

	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
	    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
		btrfs_info(fs_info, "creating free space tree");
		ret = btrfs_create_free_space_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				"failed to create free space tree: %d", ret);
			close_ctree(fs_info);
			return ret;
		}
	}

	down_read(&fs_info->cleanup_work_sem);
	if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
	    (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
		up_read(&fs_info->cleanup_work_sem);
		close_ctree(fs_info);
		return ret;
	}
	up_read(&fs_info->cleanup_work_sem);

	ret = btrfs_resume_balance_async(fs_info);
	if (ret) {
		btrfs_warn(fs_info, "failed to resume balance: %d", ret);
		close_ctree(fs_info);
		return ret;
	}

	ret = btrfs_resume_dev_replace_async(fs_info);
	if (ret) {
		btrfs_warn(fs_info, "failed to resume device replace: %d", ret);
		close_ctree(fs_info);
		return ret;
	}

	btrfs_qgroup_rescan_resume(fs_info);
	btrfs_discard_resume(fs_info);

	if (!fs_info->uuid_root) {
		btrfs_info(fs_info, "creating UUID tree");
		ret = btrfs_create_uuid_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				"failed to create the UUID tree: %d", ret);
			close_ctree(fs_info);
			return ret;
		}
	} else if (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
		   fs_info->generation !=
				btrfs_super_uuid_tree_generation(disk_super)) {
		btrfs_info(fs_info, "checking UUID tree");
		ret = btrfs_check_uuid_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				"failed to check the UUID tree: %d", ret);
			close_ctree(fs_info);
			return ret;
		}
	} else {
		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
	}
	set_bit(BTRFS_FS_OPEN, &fs_info->flags);

3352 3353 3354 3355 3356 3357
	/*
	 * backuproot only affects mount behavior, and if open_ctree succeeded,
	 * there is no need to keep the flag
	 */
	btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);

	return 0;

fail_qgroup:
	btrfs_free_qgroup_config(fs_info);
fail_trans_kthread:
	kthread_stop(fs_info->transaction_kthread);
	btrfs_cleanup_transaction(fs_info);
	btrfs_free_fs_roots(fs_info);
fail_cleaner:
	kthread_stop(fs_info->cleaner_kthread);

	/*
	 * make sure we're done with the btree inode before we stop our
	 * kthreads
	 */
	filemap_write_and_wait(fs_info->btree_inode->i_mapping);

fail_sysfs:
	btrfs_sysfs_remove_mounted(fs_info);

fail_fsdev_sysfs:
	btrfs_sysfs_remove_fsid(fs_info->fs_devices);

fail_block_groups:
	btrfs_put_block_group_cache(fs_info);

fail_tree_roots:
	free_root_pointers(fs_info, true);
	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);

fail_sb_buffer:
	btrfs_stop_all_workers(fs_info);
	btrfs_free_block_groups(fs_info);
fail_alloc:
	btrfs_mapping_tree_free(&fs_info->mapping_tree);

	iput(fs_info->btree_inode);
fail_srcu:
	cleanup_srcu_struct(&fs_info->subvol_srcu);
fail:
	btrfs_close_devices(fs_info->fs_devices);
	return err;
}
ALLOW_ERROR_INJECTION(open_ctree, ERRNO);

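/*
 * Completion callback for superblock buffer heads: on write failure, bump
 * the device's write error statistics instead of flagging the buffer head.
 */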
static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		struct btrfs_device *device = (struct btrfs_device *)
			bh->b_private;

		btrfs_warn_rl_in_rcu(device->fs_info,
				"lost page write due to IO error on %s",
					  rcu_str_deref(device->name));
		/* note, we don't set_buffer_write_io_error because we have
		 * our own ways of dealing with the IO errors
		 */
		clear_buffer_uptodate(bh);
		btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
	}
	unlock_buffer(bh);
	put_bh(bh);
}

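/*
 * Read the superblock copy @copy_num from @bdev and sanity check its bytenr
 * and magic; on success the buffer head is handed back via @bh_ret.
 */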
int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
			struct buffer_head **bh_ret)
{
	struct buffer_head *bh;
	struct btrfs_super_block *super;
	u64 bytenr;

	bytenr = btrfs_sb_offset(copy_num);
	if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
		return -EINVAL;

	bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE);
	/*
	 * If we fail to read from the underlying devices, as of now
	 * the best option we have is to mark it EIO.
	 */
	if (!bh)
		return -EIO;

	super = (struct btrfs_super_block *)bh->b_data;
	if (btrfs_super_bytenr(super) != bytenr ||
		    btrfs_super_magic(super) != BTRFS_MAGIC) {
		brelse(bh);
		return -EINVAL;
	}

	*bh_ret = bh;
	return 0;
}

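/*
 * Return the super block buffer head with the highest generation; as the
 * comment in the loop explains, only the primary copy is scanned for now.
 */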
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
{
	struct buffer_head *bh;
	struct buffer_head *latest = NULL;
	struct btrfs_super_block *super;
	int i;
	u64 transid = 0;
	int ret = -EINVAL;

	/* we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	for (i = 0; i < 1; i++) {
		ret = btrfs_read_dev_one_super(bdev, i, &bh);
		if (ret)
			continue;

		super = (struct btrfs_super_block *)bh->b_data;

		if (!latest || btrfs_super_generation(super) > transid) {
			brelse(latest);
			latest = bh;
			transid = btrfs_super_generation(super);
		} else {
			brelse(bh);
		}
	}

	if (!latest)
		return ERR_PTR(ret);

	return latest;
}

/*
 * Write superblock @sb to the @device. Do not wait for completion, all the
 * buffer heads we write are pinned.
 *
 * Write @max_mirrors copies of the superblock, where 0 means default that fits
 * the expected device size at commit time. Note that max_mirrors must be the
 * same for the write and wait phases.
 *
 * Return number of errors when buffer head is not found or submission fails.
 */
static int write_dev_supers(struct btrfs_device *device,
			    struct btrfs_super_block *sb, int max_mirrors)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	struct buffer_head *bh;
	int i;
	int ret;
	int errors = 0;
	u64 bytenr;
	int op_flags;

	if (max_mirrors == 0)
		max_mirrors = BTRFS_SUPER_MIRROR_MAX;

	shash->tfm = fs_info->csum_shash;

	for (i = 0; i < max_mirrors; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
		    device->commit_total_bytes)
			break;

		btrfs_set_super_bytenr(sb, bytenr);

		crypto_shash_init(shash);
		crypto_shash_update(shash, (const char *)sb + BTRFS_CSUM_SIZE,
				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
		crypto_shash_final(shash, sb->csum);

		/* One reference for us, and we leave it for the caller */
		bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE,
			      BTRFS_SUPER_INFO_SIZE);
		if (!bh) {
			btrfs_err(device->fs_info,
			    "couldn't get super buffer head for bytenr %llu",
			    bytenr);
			errors++;
			continue;
		}

		memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);

		/* one reference for submit_bh */
		get_bh(bh);

		set_buffer_uptodate(bh);
		lock_buffer(bh);
		bh->b_end_io = btrfs_end_buffer_write_sync;
		bh->b_private = device;

		/*
		 * We FUA the first super.  The others we allow
		 * to go down lazily.
		 */
		op_flags = REQ_SYNC | REQ_META | REQ_PRIO;
		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
			op_flags |= REQ_FUA;
		ret = btrfsic_submit_bh(REQ_OP_WRITE, op_flags, bh);
		if (ret)
			errors++;
	}
	return errors < i ? 0 : -1;
}

/*
 * Wait for write completion of superblocks done by write_dev_supers,
 * @max_mirrors same for write and wait phases.
 *
 * Return number of errors when buffer head is not found or not marked up to
 * date.
 */
static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
{
	struct buffer_head *bh;
	int i;
	int errors = 0;
	bool primary_failed = false;
	u64 bytenr;

	if (max_mirrors == 0)
		max_mirrors = BTRFS_SUPER_MIRROR_MAX;

	for (i = 0; i < max_mirrors; i++) {
		bytenr = btrfs_sb_offset(i);
		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
		    device->commit_total_bytes)
			break;

		bh = __find_get_block(device->bdev,
				      bytenr / BTRFS_BDEV_BLOCKSIZE,
				      BTRFS_SUPER_INFO_SIZE);
		if (!bh) {
			errors++;
			if (i == 0)
				primary_failed = true;
			continue;
		}
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh)) {
			errors++;
			if (i == 0)
				primary_failed = true;
		}

		/* drop our reference */
		brelse(bh);

		/* drop the reference from the writing run */
		brelse(bh);
	}

	/* log error, force error return */
	if (primary_failed) {
		btrfs_err(device->fs_info, "error writing primary super block to device %llu",
			  device->devid);
		return -1;
	}

	return errors < i ? 0 : -1;
}

/*
 * endio for the write_dev_flush, this will wake anyone waiting
 * for the barrier when it is done
 */
static void btrfs_end_empty_barrier(struct bio *bio)
{
	complete(bio->bi_private);
}

/*
 * Submit a flush request to the device if it supports it. Error handling is
 * done in the waiting counterpart.
 */
static void write_dev_flush(struct btrfs_device *device)
{
	struct request_queue *q = bdev_get_queue(device->bdev);
	struct bio *bio = device->flush_bio;

	if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
		return;

	bio_reset(bio);
	bio->bi_end_io = btrfs_end_empty_barrier;
	bio_set_dev(bio, device->bdev);
	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
	init_completion(&device->flush_wait);
	bio->bi_private = &device->flush_wait;

	btrfsic_submit_bio(bio);
	set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
}

/*
 * If the flush bio has been submitted by write_dev_flush, wait for it.
 */
static blk_status_t wait_dev_flush(struct btrfs_device *device)
{
	struct bio *bio = device->flush_bio;

	if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
		return BLK_STS_OK;

	clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
	wait_for_completion_io(&device->flush_wait);

	return bio->bi_status;
}

static int check_barrier_error(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_check_rw_degradable(fs_info, NULL))
		return -EIO;
	return 0;
}

/*
 * send an empty flush down to each device in parallel,
 * then wait for them
 */
static int barrier_all_devices(struct btrfs_fs_info *info)
{
	struct list_head *head;
	struct btrfs_device *dev;
	int errors_wait = 0;
	blk_status_t ret;

	lockdep_assert_held(&info->fs_devices->device_list_mutex);
	/* send down all the barriers */
	head = &info->fs_devices->devices;
	list_for_each_entry(dev, head, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
			continue;
		if (!dev->bdev)
			continue;
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		write_dev_flush(dev);
		dev->last_flush_error = BLK_STS_OK;
	}

	/* wait for all the barriers */
	list_for_each_entry(dev, head, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
			continue;
		if (!dev->bdev) {
			errors_wait++;
			continue;
		}
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		ret = wait_dev_flush(dev);
		if (ret) {
			dev->last_flush_error = ret;
			btrfs_dev_stat_inc_and_print(dev,
					BTRFS_DEV_STAT_FLUSH_ERRS);
			errors_wait++;
		}
	}

	if (errors_wait) {
		/*
		 * At some point we need the status of all disks
		 * to arrive at the volume status. So error checking
		 * is being pushed to a separate loop.
		 */
		return check_barrier_error(info);
	}
	return 0;
}

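/*
 * Return the smallest number of device failures tolerated by any of the
 * RAID profiles selected in @flags.
 */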
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
{
	int raid_type;
	int min_tolerated = INT_MAX;

	if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
	    (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
		min_tolerated = min_t(int, min_tolerated,
				    btrfs_raid_array[BTRFS_RAID_SINGLE].
				    tolerated_failures);

	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
		if (raid_type == BTRFS_RAID_SINGLE)
			continue;
		if (!(flags & btrfs_raid_array[raid_type].bg_flag))
			continue;
		min_tolerated = min_t(int, min_tolerated,
				    btrfs_raid_array[raid_type].
				    tolerated_failures);
	}

	if (min_tolerated == INT_MAX) {
		pr_warn("BTRFS: unknown raid flag: %llu", flags);
		min_tolerated = 0;
	}

	return min_tolerated;
}

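/*
 * Write the superblock to all writeable devices, preceded by a flush
 * barrier on each device unless barriers are disabled. Up to
 * (number of devices - 1) per-device failures are tolerated.
 */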
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
{
	struct list_head *head;
	struct btrfs_device *dev;
	struct btrfs_super_block *sb;
	struct btrfs_dev_item *dev_item;
	int ret;
	int do_barriers;
	int max_errors;
	int total_errors = 0;
	u64 flags;

	do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);

	/*
	 * max_mirrors == 0 indicates we're from commit_transaction,
	 * not from fsync where the tree roots in fs_info have not
	 * been consistent on disk.
	 */
	if (max_mirrors == 0)
		backup_super_roots(fs_info);

	sb = fs_info->super_for_commit;
	dev_item = &sb->dev_item;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	head = &fs_info->fs_devices->devices;
	max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;

	if (do_barriers) {
		ret = barrier_all_devices(fs_info);
		if (ret) {
			mutex_unlock(
				&fs_info->fs_devices->device_list_mutex);
			btrfs_handle_fs_error(fs_info, ret,
					      "errors while submitting device barriers.");
			return ret;
		}
	}

	list_for_each_entry(dev, head, dev_list) {
		if (!dev->bdev) {
			total_errors++;
			continue;
		}
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		btrfs_set_stack_device_generation(dev_item, 0);
		btrfs_set_stack_device_type(dev_item, dev->type);
		btrfs_set_stack_device_id(dev_item, dev->devid);
		btrfs_set_stack_device_total_bytes(dev_item,
						   dev->commit_total_bytes);
		btrfs_set_stack_device_bytes_used(dev_item,
						  dev->commit_bytes_used);
		btrfs_set_stack_device_io_align(dev_item, dev->io_align);
		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
		memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
		       BTRFS_FSID_SIZE);

		flags = btrfs_super_flags(sb);
		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);

		ret = btrfs_validate_write_super(fs_info, sb);
		if (ret < 0) {
			mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			btrfs_handle_fs_error(fs_info, -EUCLEAN,
				"unexpected superblock corruption detected");
			return -EUCLEAN;
		}

		ret = write_dev_supers(dev, sb, max_mirrors);
		if (ret)
			total_errors++;
	}
	if (total_errors > max_errors) {
		btrfs_err(fs_info, "%d errors while writing supers",
			  total_errors);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);

		/* FUA is masked off if unsupported and can't be the reason */
		btrfs_handle_fs_error(fs_info, -EIO,
				      "%d errors while writing supers",
				      total_errors);
		return -EIO;
	}

	total_errors = 0;
	list_for_each_entry(dev, head, dev_list) {
		if (!dev->bdev)
			continue;
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		ret = wait_dev_supers(dev, max_mirrors);
		if (ret)
			total_errors++;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	if (total_errors > max_errors) {
		btrfs_handle_fs_error(fs_info, -EIO,
				      "%d errors while writing supers",
				      total_errors);
		return -EIO;
	}
	return 0;
}

/* Drop a fs root from the radix tree and free it. */
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
				  struct btrfs_root *root)
{
	spin_lock(&fs_info->fs_roots_radix_lock);
	radix_tree_delete(&fs_info->fs_roots_radix,
			  (unsigned long)root->root_key.objectid);
	if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
		btrfs_put_root(root);
	spin_unlock(&fs_info->fs_roots_radix_lock);

	if (btrfs_root_refs(&root->root_item) == 0)
		synchronize_srcu(&fs_info->subvol_srcu);

	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
		btrfs_free_log(NULL, root);
		if (root->reloc_root) {
			free_extent_buffer(root->reloc_root->node);
			free_extent_buffer(root->reloc_root->commit_root);
			btrfs_put_root(root->reloc_root);
			root->reloc_root = NULL;
		}
	}

	if (root->free_ino_pinned)
		__btrfs_remove_free_space_cache(root->free_ino_pinned);
	if (root->free_ino_ctl)
		__btrfs_remove_free_space_cache(root->free_ino_ctl);
	btrfs_free_fs_root(root);
}

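/*
 * Free everything cached on a fs root: the ino cache inode, anonymous
 * bdev, subvolume writers and extent buffers, then drop our reference.
 */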
void btrfs_free_fs_root(struct btrfs_root *root)
{
	iput(root->ino_cache_inode);
	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
	if (root->anon_dev)
		free_anon_bdev(root->anon_dev);
	if (root->subv_writers)
		btrfs_free_subvolume_writers(root->subv_writers);
	free_extent_buffer(root->node);
	free_extent_buffer(root->commit_root);
	kfree(root->free_ino_ctl);
	kfree(root->free_ino_pinned);
	btrfs_put_root(root);
}

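/*
 * Walk all fs roots in the radix tree and run orphan cleanup on each one,
 * releasing any roots that were grabbed if an error stops the scan early.
 */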
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
	u64 root_objectid = 0;
	struct btrfs_root *gang[8];
	int i = 0;
	int err = 0;
	unsigned int ret = 0;
	int index;

	while (1) {
		index = srcu_read_lock(&fs_info->subvol_srcu);
		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
					     (void **)gang, root_objectid,
					     ARRAY_SIZE(gang));
		if (!ret) {
			srcu_read_unlock(&fs_info->subvol_srcu, index);
			break;
		}
		root_objectid = gang[ret - 1]->root_key.objectid + 1;

		for (i = 0; i < ret; i++) {
			/* Avoid grabbing roots in dead_roots */
			if (btrfs_root_refs(&gang[i]->root_item) == 0) {
				gang[i] = NULL;
				continue;
			}
			/* grab all the search results for later use */
			gang[i] = btrfs_grab_root(gang[i]);
		}
		srcu_read_unlock(&fs_info->subvol_srcu, index);

		for (i = 0; i < ret; i++) {
			if (!gang[i])
				continue;
			root_objectid = gang[i]->root_key.objectid;
			err = btrfs_orphan_cleanup(gang[i]);
			if (err)
				break;
			btrfs_put_root(gang[i]);
		}
		root_objectid++;
	}

	/* release the uncleaned roots due to error */
	for (; i < ret; i++) {
		if (gang[i])
			btrfs_put_root(gang[i]);
	}
	return err;
}

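/*
 * Run the pending delayed iputs, wait for ongoing cleanup work and then
 * join and commit the current transaction.
 */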
int btrfs_commit_super(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;

	mutex_lock(&fs_info->cleaner_mutex);
	btrfs_run_delayed_iputs(fs_info);
	mutex_unlock(&fs_info->cleaner_mutex);
	wake_up_process(fs_info->cleaner_kthread);

	/* wait until ongoing cleanup work is done */
	down_write(&fs_info->cleanup_work_sem);
	up_write(&fs_info->cleanup_work_sem);

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);
	return btrfs_commit_transaction(trans);
}

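/*
 * Unmount-time teardown: stop background work and kthreads, commit or
 * clean up the last transaction, then free roots, block groups and devices.
 */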
void __cold close_ctree(struct btrfs_fs_info *fs_info)
{
	int ret;

	set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
	/*
	 * We don't want the cleaner to start new transactions, add more delayed
	 * iputs, etc. while we're closing. We can't use kthread_stop() yet
	 * because that frees the task_struct, and the transaction kthread might
	 * still try to wake up the cleaner.
	 */
	kthread_park(fs_info->cleaner_kthread);

	/* wait for the qgroup rescan worker to stop */
	btrfs_qgroup_wait_for_completion(fs_info, false);

	/* wait for the uuid_scan task to finish */
	down(&fs_info->uuid_tree_rescan_sem);
	/* avoid complaints from lockdep et al., set sem back to initial state */
	up(&fs_info->uuid_tree_rescan_sem);

	/* pause restriper - we want to resume on mount */
	btrfs_pause_balance(fs_info);

	btrfs_dev_replace_suspend_for_unmount(fs_info);

	btrfs_scrub_cancel(fs_info);

	/* wait for any defraggers to finish */
	wait_event(fs_info->transaction_wait,
		   (atomic_read(&fs_info->defrag_running) == 0));

	/* clear out the rbtree of defraggable inodes */
	btrfs_cleanup_defrag_inodes(fs_info);

	cancel_work_sync(&fs_info->async_reclaim_work);

	/* Cancel or finish ongoing discard work */
	btrfs_discard_cleanup(fs_info);

	if (!sb_rdonly(fs_info->sb)) {
		/*
		 * The cleaner kthread is stopped, so do one final pass over
		 * unused block groups.
		 */
		btrfs_delete_unused_bgs(fs_info);

		ret = btrfs_commit_super(fs_info);
		if (ret)
			btrfs_err(fs_info, "commit super ret %d", ret);
	}

	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) ||
	    test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state))
		btrfs_error_commit_super(fs_info);

	kthread_stop(fs_info->transaction_kthread);
	kthread_stop(fs_info->cleaner_kthread);

	ASSERT(list_empty(&fs_info->delayed_iputs));
	set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);

	btrfs_free_qgroup_config(fs_info);
	ASSERT(list_empty(&fs_info->delalloc_roots));

	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
		btrfs_info(fs_info, "at unmount delalloc count %lld",
		       percpu_counter_sum(&fs_info->delalloc_bytes));
	}

	if (percpu_counter_sum(&fs_info->dio_bytes))
		btrfs_info(fs_info, "at unmount dio bytes count %lld",
			   percpu_counter_sum(&fs_info->dio_bytes));

	btrfs_sysfs_remove_mounted(fs_info);
	btrfs_sysfs_remove_fsid(fs_info->fs_devices);

	btrfs_free_fs_roots(fs_info);

	btrfs_put_block_group_cache(fs_info);

	/*
	 * We must make sure no read request is submitted after we stop all
	 * workers.
	 */
	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
	btrfs_stop_all_workers(fs_info);

	clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
	free_root_pointers(fs_info, true);

	/*
	 * We must free the block groups after dropping the fs_roots as we could
	 * have had an IO error and have left over tree log blocks that aren't
	 * cleaned up until the fs roots are freed.  This makes the block group
	 * accounting appear to be wrong because there's pending reserved bytes,
	 * so make sure we do the block group cleanup afterwards.
	 */
	btrfs_free_block_groups(fs_info);

	iput(fs_info->btree_inode);

#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
		btrfsic_unmount(fs_info->fs_devices);
#endif

	btrfs_mapping_tree_free(&fs_info->mapping_tree);
	btrfs_close_devices(fs_info->fs_devices);
	cleanup_srcu_struct(&fs_info->subvol_srcu);
}

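/*
 * Returns 1 when @buf is up to date and its generation matches
 * @parent_transid, 0 when it does not, or -EAGAIN when the check cannot be
 * completed in atomic context.
 */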
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
			  int atomic)
{
	int ret;
	struct inode *btree_inode = buf->pages[0]->mapping->host;

	ret = extent_buffer_uptodate(buf);
	if (!ret)
		return ret;

	ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
				    parent_transid, atomic);
	if (ret == -EAGAIN)
		return ret;
	return !ret;
}

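/*
 * Mark an extent buffer dirty and account its bytes in the dirty metadata
 * counter. The buffer must be tree locked and belong to the running
 * transaction's generation.
 */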
void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_root *root;
	u64 transid = btrfs_header_generation(buf);
	int was_dirty;

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
	/*
	 * This is a fast path so only do this check if we have sanity tests
	 * enabled.  Normal people shouldn't be using unmapped buffers as dirty
	 * outside of the sanity tests.
	 */
	if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
		return;
#endif
	root = BTRFS_I(buf->pages[0]->mapping->host)->root;
	fs_info = root->fs_info;
	btrfs_assert_tree_locked(buf);
	if (transid != fs_info->generation)
		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
			buf->start, transid, fs_info->generation);
	was_dirty = set_extent_buffer_dirty(buf);
	if (!was_dirty)
		percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
					 buf->len,
					 fs_info->dirty_metadata_batch);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	/*
	 * btrfs_mark_buffer_dirty() can be called with the item pointer set
	 * but the item data not yet updated, so here we should only check
	 * item pointers, not item data.
	 */
	if (btrfs_header_level(buf) == 0 &&
	    btrfs_check_leaf_relaxed(buf)) {
		btrfs_print_leaf(buf);
		ASSERT(0);
	}
#endif
}

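/*
 * Throttle callers when too much dirty btree metadata has built up,
 * optionally flushing delayed items first.
 */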
static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
					int flush_delayed)
{
	/*
	 * Looks as though older kernels can get into trouble with
	 * this code; they end up stuck in balance_dirty_pages forever.
	 */
	int ret;

	if (current->flags & PF_MEMALLOC)
		return;

	if (flush_delayed)
		btrfs_balance_delayed_items(fs_info);

	ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
				     BTRFS_DIRTY_METADATA_THRESH,
				     fs_info->dirty_metadata_batch);
	if (ret > 0) {
		balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
	}
}

void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
{
	__btrfs_btree_balance_dirty(fs_info, 1);
}

void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
{
	__btrfs_btree_balance_dirty(fs_info, 0);
}

int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
		      struct btrfs_key *first_key)
{
	return btree_read_extent_buffer_pages(buf, parent_transid,
					      level, first_key);
}

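/*
 * Error-path counterpart of btrfs_commit_super(): tear down the running
 * transaction, run delayed iputs and wait for outstanding cleanup work.
 */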
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
{
	/* cleanup FS via transaction */
	btrfs_cleanup_transaction(fs_info);

	mutex_lock(&fs_info->cleaner_mutex);
	btrfs_run_delayed_iputs(fs_info);
	mutex_unlock(&fs_info->cleaner_mutex);

	down_write(&fs_info->cleanup_work_sem);
	up_write(&fs_info->cleanup_work_sem);
}

static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
	struct btrfs_ordered_extent *ordered;

	spin_lock(&root->ordered_extent_lock);
	/*
	 * This will just short circuit the ordered completion stuff which will
	 * make sure the ordered extent gets properly cleaned up.
	 */
	list_for_each_entry(ordered, &root->ordered_extents,
			    root_extent_list)
		set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
	spin_unlock(&root->ordered_extent_lock);
}

static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;
	struct list_head splice;

	INIT_LIST_HEAD(&splice);

	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
	while (!list_empty(&splice)) {
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);

		spin_unlock(&fs_info->ordered_root_lock);
		btrfs_destroy_ordered_extents(root);

		cond_resched();
		spin_lock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&fs_info->ordered_root_lock);

	/*
	 * We need this here because if we've been flipped read-only we won't
	 * get sync() from the umount, so we need to make sure any ordered
	 * extents that haven't had their dirty pages IO start writeout yet
	 * actually get run and error out properly.
	 */
	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}

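/*
 * Throw away all delayed refs of an aborted transaction, returning any
 * reserved bytes tracked by the ref heads to the pinned space accounting.
 */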
static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
				      struct btrfs_fs_info *fs_info)
{
	struct rb_node *node;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_node *ref;
	int ret = 0;

	delayed_refs = &trans->delayed_refs;

	spin_lock(&delayed_refs->lock);
	if (atomic_read(&delayed_refs->num_entries) == 0) {
		spin_unlock(&delayed_refs->lock);
		btrfs_info(fs_info, "delayed_refs has NO entry");
		return ret;
	}

	while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
		struct btrfs_delayed_ref_head *head;
		struct rb_node *n;
		bool pin_bytes = false;

		head = rb_entry(node, struct btrfs_delayed_ref_head,
				href_node);
		if (btrfs_delayed_ref_lock(delayed_refs, head))
			continue;

		spin_lock(&head->lock);
		while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
			ref = rb_entry(n, struct btrfs_delayed_ref_node,
				       ref_node);
			ref->in_tree = 0;
			rb_erase_cached(&ref->ref_node, &head->ref_tree);
			RB_CLEAR_NODE(&ref->ref_node);
			if (!list_empty(&ref->add_list))
				list_del(&ref->add_list);
			atomic_dec(&delayed_refs->num_entries);
			btrfs_put_delayed_ref(ref);
		}
		if (head->must_insert_reserved)
			pin_bytes = true;
		btrfs_free_delayed_extent_op(head->extent_op);
		btrfs_delete_ref_head(delayed_refs, head);
		spin_unlock(&head->lock);
		spin_unlock(&delayed_refs->lock);
		mutex_unlock(&head->mutex);

		if (pin_bytes) {
			struct btrfs_block_group *cache;

			cache = btrfs_lookup_block_group(fs_info, head->bytenr);
			BUG_ON(!cache);

			spin_lock(&cache->space_info->lock);
			spin_lock(&cache->lock);
			cache->pinned += head->num_bytes;
			btrfs_space_info_update_bytes_pinned(fs_info,
				cache->space_info, head->num_bytes);
			cache->reserved -= head->num_bytes;
			cache->space_info->bytes_reserved -= head->num_bytes;
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);
			percpu_counter_add_batch(
				&cache->space_info->total_bytes_pinned,
				head->num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);

			btrfs_put_block_group(cache);

			btrfs_error_unpin_extent_range(fs_info, head->bytenr,
				head->bytenr + head->num_bytes - 1);
		}
		btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
		btrfs_put_delayed_ref_head(head);
		cond_resched();
		spin_lock(&delayed_refs->lock);
	}
	btrfs_qgroup_destroy_extent_records(trans);

	spin_unlock(&delayed_refs->lock);

	return ret;
}

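/*
 * Drop every delalloc inode of a root from the delalloc list and invalidate
 * its pages so no further writeback is attempted for them.
 */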
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
{
	struct btrfs_inode *btrfs_inode;
	struct list_head splice;

	INIT_LIST_HEAD(&splice);

	spin_lock(&root->delalloc_lock);
	list_splice_init(&root->delalloc_inodes, &splice);

	while (!list_empty(&splice)) {
		struct inode *inode = NULL;
		btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
					       delalloc_inodes);
		__btrfs_del_delalloc_inode(root, btrfs_inode);
		spin_unlock(&root->delalloc_lock);

		/*
		 * Make sure we get a live inode and that it'll not disappear
		 * meanwhile.
		 */
		inode = igrab(&btrfs_inode->vfs_inode);
		if (inode) {
			invalidate_inode_pages2(inode->i_mapping);
			iput(inode);
		}
		spin_lock(&root->delalloc_lock);
	}
	spin_unlock(&root->delalloc_lock);
}

static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;
	struct list_head splice;

	INIT_LIST_HEAD(&splice);

	spin_lock(&fs_info->delalloc_root_lock);
	list_splice_init(&fs_info->delalloc_roots, &splice);
	while (!list_empty(&splice)) {
		root = list_first_entry(&splice, struct btrfs_root,
					 delalloc_root);
		root = btrfs_grab_root(root);
		BUG_ON(!root);
		spin_unlock(&fs_info->delalloc_root_lock);

		btrfs_destroy_delalloc_inodes(root);
		btrfs_put_root(root);

		spin_lock(&fs_info->delalloc_root_lock);
	}
	spin_unlock(&fs_info->delalloc_root_lock);
}

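/*
 * Clear @mark from @dirty_pages and drop the dirty bit from every extent
 * buffer found in the cleared ranges.
 */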
static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
					struct extent_io_tree *dirty_pages,
					int mark)
{
	int ret;
	struct extent_buffer *eb;
	u64 start = 0;
	u64 end;

	while (1) {
		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
					    mark, NULL);
		if (ret)
			break;

		clear_extent_bits(dirty_pages, start, end, mark);
		while (start <= end) {
			eb = find_extent_buffer(fs_info, start);
			start += fs_info->nodesize;
			if (!eb)
				continue;
			wait_on_extent_buffer_writeback(eb);

			if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
					       &eb->bflags))
				clear_extent_buffer_dirty(eb);
			free_extent_buffer_stale(eb);
		}
	}

	return ret;
}

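/*
 * Unpin everything left in the pinned extent trees; both freed_extents
 * trees are walked since either one may be the currently active tree.
 */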
static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
				       struct extent_io_tree *pinned_extents)
{
	struct extent_io_tree *unpin;
	u64 start;
	u64 end;
	int ret;
	bool loop = true;

	unpin = pinned_extents;
again:
	while (1) {
		struct extent_state *cached_state = NULL;

		/*
		 * The btrfs_finish_extent_commit() may get the same range as
		 * ours between find_first_extent_bit and clear_extent_dirty.
		 * Hence, hold the unused_bg_unpin_mutex to avoid double
		 * unpinning of the same extent range.
		 */
		mutex_lock(&fs_info->unused_bg_unpin_mutex);
		ret = find_first_extent_bit(unpin, 0, &start, &end,
					    EXTENT_DIRTY, &cached_state);
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			break;
		}

		clear_extent_dirty(unpin, start, end, &cached_state);
		free_extent_state(cached_state);
		btrfs_error_unpin_extent_range(fs_info, start, end);
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
		cond_resched();
	}

	if (loop) {
		if (unpin == &fs_info->freed_extents[0])
			unpin = &fs_info->freed_extents[1];
		else
			unpin = &fs_info->freed_extents[0];
		loop = false;
		goto again;
	}

	return 0;
}

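/* Drop a block group's free space cache inode after an aborted commit. */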
static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
{
	struct inode *inode;

	inode = cache->io_ctl.inode;
	if (inode) {
		invalidate_inode_pages2(inode->i_mapping);
		BTRFS_I(inode)->generation = 0;
		cache->io_ctl.inode = NULL;
		iput(inode);
	}
	btrfs_put_block_group(cache);
}

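/*
 * Detach all dirty and in-flight block groups from an aborted transaction
 * and mark their space caches as errored.
 */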
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
			     struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *cache;

	spin_lock(&cur_trans->dirty_bgs_lock);
	while (!list_empty(&cur_trans->dirty_bgs)) {
		cache = list_first_entry(&cur_trans->dirty_bgs,
					 struct btrfs_block_group,
					 dirty_list);

		if (!list_empty(&cache->io_list)) {
			spin_unlock(&cur_trans->dirty_bgs_lock);
			list_del_init(&cache->io_list);
			btrfs_cleanup_bg_io(cache);
			spin_lock(&cur_trans->dirty_bgs_lock);
		}

		list_del_init(&cache->dirty_list);
		spin_lock(&cache->lock);
		cache->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&cache->lock);

		spin_unlock(&cur_trans->dirty_bgs_lock);
		btrfs_put_block_group(cache);
		btrfs_delayed_refs_rsv_release(fs_info, 1);
		spin_lock(&cur_trans->dirty_bgs_lock);
	}
	spin_unlock(&cur_trans->dirty_bgs_lock);

	/*
	 * Refer to the definition of io_bgs member for details why it's safe
	 * to use it without any locking
	 */
	while (!list_empty(&cur_trans->io_bgs)) {
		cache = list_first_entry(&cur_trans->io_bgs,
					 struct btrfs_block_group,
					 io_list);

		list_del_init(&cache->io_list);
		spin_lock(&cache->lock);
		cache->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&cache->lock);
		btrfs_cleanup_bg_io(cache);
	}
}

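/*
 * Tear down a single transaction during abort: clean up its block groups,
 * delayed refs and inodes, dirty and pinned extents, then mark the
 * transaction completed so all waiters are released.
 */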
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
				   struct btrfs_fs_info *fs_info)
{
	struct btrfs_device *dev, *tmp;

	btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
	ASSERT(list_empty(&cur_trans->dirty_bgs));
	ASSERT(list_empty(&cur_trans->io_bgs));

	list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
				 post_commit_list) {
		list_del_init(&dev->post_commit_list);
	}

	btrfs_destroy_delayed_refs(cur_trans, fs_info);

	cur_trans->state = TRANS_STATE_COMMIT_START;
	wake_up(&fs_info->transaction_blocked_wait);

	cur_trans->state = TRANS_STATE_UNBLOCKED;
	wake_up(&fs_info->transaction_wait);

	btrfs_destroy_delayed_inodes(fs_info);

	btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
				     EXTENT_DIRTY);
	btrfs_destroy_pinned_extent(fs_info,
				    fs_info->pinned_extents);

	cur_trans->state = TRANS_STATE_COMPLETED;
	wake_up(&cur_trans->commit_wait);
}

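/*
 * Abort-time teardown of every transaction still on fs_info->trans_list:
 * committed ones are waited for, the rest are cleaned up in place.
 */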
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
{
	struct btrfs_transaction *t;

	mutex_lock(&fs_info->transaction_kthread_mutex);

	spin_lock(&fs_info->trans_lock);
	while (!list_empty(&fs_info->trans_list)) {
		t = list_first_entry(&fs_info->trans_list,
				     struct btrfs_transaction, list);
		if (t->state >= TRANS_STATE_COMMIT_START) {
			refcount_inc(&t->use_count);
			spin_unlock(&fs_info->trans_lock);
			btrfs_wait_for_commit(fs_info, t->transid);
			btrfs_put_transaction(t);
			spin_lock(&fs_info->trans_lock);
			continue;
		}
		if (t == fs_info->running_transaction) {
			t->state = TRANS_STATE_COMMIT_DOING;
			spin_unlock(&fs_info->trans_lock);
			/*
			 * We wait for 0 num_writers since we don't hold a trans
			 * handle open currently for this transaction.
			 */
			wait_event(t->writer_wait,
				   atomic_read(&t->num_writers) == 0);
		} else {
			spin_unlock(&fs_info->trans_lock);
		}
		btrfs_cleanup_one_transaction(t, fs_info);

		spin_lock(&fs_info->trans_lock);
		if (t == fs_info->running_transaction)
			fs_info->running_transaction = NULL;
		list_del_init(&t->list);
		spin_unlock(&fs_info->trans_lock);

		btrfs_put_transaction(t);
		trace_btrfs_transaction_commit(fs_info->tree_root);
		spin_lock(&fs_info->trans_lock);
	}
	spin_unlock(&fs_info->trans_lock);
	btrfs_destroy_all_ordered_extents(fs_info);
	btrfs_destroy_delayed_inodes(fs_info);
	btrfs_assert_delayed_root_empty(fs_info);
	btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
	btrfs_destroy_all_delalloc_inodes(fs_info);
	mutex_unlock(&fs_info->transaction_kthread_mutex);

	return 0;
}

static const struct extent_io_ops btree_extent_io_ops = {
	/* mandatory callbacks */
	.submit_bio_hook = btree_submit_bio_hook,
	.readpage_end_io_hook = btree_readpage_end_io_hook,
};