// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2009 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/error-injection.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "locking.h"
#include "btrfs_inode.h"
#include "async-thread.h"
#include "free-space-cache.h"
#include "qgroup.h"
#include "print-tree.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "backref.h"
#include "misc.h"
#include "subpage.h"
#include "zoned.h"
#include "inode-item.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "file-item.h"
#include "relocation.h"
#include "super.h"

/*
 * Relocation overview
 *
 * [What does relocation do]
 *
 * The objective of relocation is to relocate all extents of the target block
 * group to other block groups.
 * This is used by resize (shrink only), profile conversion, space
 * compaction, and the balance routine to spread chunks over devices.
 *
 * 		Before		|		After
 * ------------------------------------------------------------------
 *  BG A: 10 data extents	| BG A: deleted
 *  BG B:  2 data extents	| BG B: 10 data extents (2 old + 8 relocated)
 *  BG C:  1 data extent	| BG C:  3 data extents (1 old + 2 relocated)
 *
 * [How does relocation work]
 *
 * 1.   Mark the target block group read-only
 *      New extents won't be allocated from the target block group.
 *
 * 2.1  Record each extent in the target block group
 *      To build a proper map of extents to be relocated.
 *
 * 2.2  Build data reloc tree and reloc trees
 *      Data reloc tree will contain an inode, recording all newly relocated
 *      data extents.
 *      There will be only one data reloc tree for one data block group.
 *
 *      Reloc tree will be a special snapshot of its source tree, containing
 *      relocated tree blocks.
 *      Each tree referring to a tree block in the target block group will
 *      get its reloc tree built.
 *
 * 2.3  Swap source tree with its corresponding reloc tree
 *      Each involved tree only refers to new extents after swap.
 *
 * 3.   Cleanup reloc trees and data reloc tree.
 *      As old extents in the target block group are still referenced by reloc
 *      trees, we need to clean them up before really freeing the target block
 *      group.
 *
 * The main complexity is in steps 2.2 and 2.3.
 *
 * The entry point of relocation is the relocate_block_group() function.
 */
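
/*
 * A rough sketch of how this machinery is driven (illustrative only, see
 * volumes.c for the real call chain): the balance/shrink code relocates one
 * chunk at a time via
 *
 *	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
 *
 * which marks the block group read-only, runs relocate_block_group() once
 * per stage, and finally merges and drops the reloc trees.
 */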

#define RELOCATION_RESERVED_NODES	256
/*
 * map address of tree root to tree
 */
struct mapping_node {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	void *data;
};

struct mapping_tree {
	struct rb_root rb_root;
	spinlock_t lock;
};
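
/*
 * Minimal usage sketch: since struct mapping_node starts with an
 * { rb_node, bytenr } pair, lookup and insert go through the
 * rb_simple_*() helpers from misc.h, e.g. in __add_reloc_root() and
 * find_reloc_root() below:
 *
 *	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
 *	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
 *				   node->bytenr, &node->rb_node);
 */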

/*
 * Represents a tree block to be processed.
 */
struct tree_block {
	struct {
		struct rb_node rb_node;
		u64 bytenr;
	}; /* Use rb_simple_node for search/insert */
	u64 owner;
	struct btrfs_key key;
	unsigned int level:8;
	unsigned int key_ready:1;
};

#define MAX_EXTENTS 128

struct file_extent_cluster {
	u64 start;
	u64 end;
	u64 boundary[MAX_EXTENTS];
	unsigned int nr;
};
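
/*
 * Example: a cluster with nr == 3 covers three data extents that are
 * contiguous in the relocated copy; boundary[0..2] record the start offset
 * of each extent so the data can be split back into separate extents at
 * the same boundaries after it is written out.
 */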

struct reloc_control {
	/* block group to relocate */
	struct btrfs_block_group *block_group;
	/* extent tree */
	struct btrfs_root *extent_root;
	/* inode for moving data */
	struct inode *data_inode;

	struct btrfs_block_rsv *block_rsv;

	struct btrfs_backref_cache backref_cache;

	struct file_extent_cluster cluster;
	/* tree blocks have been processed */
	struct extent_io_tree processed_blocks;
	/* map start of tree root to corresponding reloc tree */
	struct mapping_tree reloc_root_tree;
	/* list of reloc trees */
	struct list_head reloc_roots;
	/* list of subvolume trees that get relocated */
	struct list_head dirty_subvol_roots;
	/* size of metadata reservation for merging reloc trees */
	u64 merging_rsv_size;
	/* size of relocated tree nodes */
	u64 nodes_relocated;
	/* reserved size for block group relocation */
	u64 reserved_bytes;

	u64 search_start;
	u64 extents_found;

	unsigned int stage:8;
	unsigned int create_reloc_tree:1;
	unsigned int merge_reloc_tree:1;
	unsigned int found_file_extent:1;
};

/* stages of data relocation */
#define MOVE_DATA_EXTENTS	0
#define UPDATE_DATA_PTRS	1
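
/*
 * Data relocation is a two-pass process: MOVE_DATA_EXTENTS copies the
 * extent contents into the data reloc inode, then UPDATE_DATA_PTRS rewrites
 * the file extent items (see replace_file_extents()) to point at the
 * relocated copies.
 */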

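/*
 * Record the block in rc->processed_blocks if it is a leaf or lies inside
 * the block group being relocated, and flag its backref node as processed.
 */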
static void mark_block_processed(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	u32 blocksize;

	if (node->level == 0 ||
	    in_range(node->bytenr, rc->block_group->start,
		     rc->block_group->length)) {
		blocksize = rc->extent_root->fs_info->nodesize;
		set_extent_bits(&rc->processed_blocks, node->bytenr,
				node->bytenr + blocksize - 1, EXTENT_DIRTY);
	}
	node->processed = 1;
}


static void mapping_tree_init(struct mapping_tree *tree)
{
	tree->rb_root = RB_ROOT;
	spin_lock_init(&tree->lock);
}

/*
 * walk up backref nodes until we reach a node that represents a tree root
 */
static struct btrfs_backref_node *walk_up_backref(
		struct btrfs_backref_node *node,
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	int idx = *index;

	while (!list_empty(&node->upper)) {
		edge = list_entry(node->upper.next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx++] = edge;
		node = edge->node[UPPER];
	}
	BUG_ON(node->detached);
	*index = idx;
	return node;
}

/*
 * walk down backref nodes to find start of next reference path
 */
static struct btrfs_backref_node *walk_down_backref(
		struct btrfs_backref_edge *edges[], int *index)
{
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_node *lower;
	int idx = *index;

	while (idx > 0) {
		edge = edges[idx - 1];
		lower = edge->node[LOWER];
		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
			idx--;
			continue;
		}
		edge = list_entry(edge->list[LOWER].next,
				  struct btrfs_backref_edge, list[LOWER]);
		edges[idx - 1] = edge;
		*index = idx;
		return edge->node[UPPER];
	}
	*index = 0;
	return NULL;
}
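
/*
 * Together, walk_up_backref() and walk_down_backref() enumerate every
 * reference path of a node.  A sketch of the typical loop (assuming an
 * edges[] array of at least BTRFS_MAX_LEVEL - 1 entries):
 *
 *	int index = 0;
 *	struct btrfs_backref_node *next;
 *
 *	next = walk_up_backref(node, edges, &index);
 *	while (next) {
 *		(process the tree root 'next' here)
 *		next = walk_down_backref(edges, &index);
 *		if (next)
 *			next = walk_up_backref(next, edges, &index);
 *	}
 */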

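/*
 * Re-index a cached backref node after its tree block has moved to @bytenr.
 */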
static void update_backref_node(struct btrfs_backref_cache *cache,
				struct btrfs_backref_node *node, u64 bytenr)
{
	struct rb_node *rb_node;
	rb_erase(&node->rb_node, &cache->rb_root);
	node->bytenr = bytenr;
	rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
	if (rb_node)
		btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST);
}

/*
 * update backref cache after a transaction commit
 */
static int update_backref_cache(struct btrfs_trans_handle *trans,
				struct btrfs_backref_cache *cache)
{
	struct btrfs_backref_node *node;
	int level = 0;

	if (cache->last_trans == 0) {
		cache->last_trans = trans->transid;
		return 0;
	}

	if (cache->last_trans == trans->transid)
		return 0;

	/*
	 * Detached nodes are used to avoid unnecessary backref
	 * lookup. A transaction commit changes the extent tree,
	 * so the detached nodes are no longer useful.
	 */
	while (!list_empty(&cache->detached)) {
		node = list_entry(cache->detached.next,
				  struct btrfs_backref_node, list);
		btrfs_backref_cleanup_node(cache, node);
	}

	while (!list_empty(&cache->changed)) {
		node = list_entry(cache->changed.next,
				  struct btrfs_backref_node, list);
		list_del_init(&node->list);
		BUG_ON(node->pending);
		update_backref_node(cache, node, node->new_bytenr);
	}

	/*
	 * Some nodes can be left in the pending list if there were
	 * errors while processing the pending nodes.
	 */
	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		list_for_each_entry(node, &cache->pending[level], list) {
			BUG_ON(!node->pending);
			if (node->bytenr == node->new_bytenr)
				continue;
			update_backref_node(cache, node, node->new_bytenr);
		}
	}

	cache->last_trans = 0;
	return 1;
}

static bool reloc_root_is_dead(struct btrfs_root *root)
{
	/*
	 * Pair with set_bit/clear_bit in clean_dirty_subvols and
	 * btrfs_update_reloc_root. We need to see the updated bit before
	 * trying to access reloc_root
	 */
	smp_rmb();
	if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
		return true;
	return false;
}

/*
 * Check if this subvolume tree has a valid reloc tree.
 *
 * A reloc tree after swap is considered dead, thus not considered valid.
 * This is enough for most callers, as they don't distinguish a dead reloc
 * root from no reloc root.  But btrfs_should_ignore_reloc_root() below is a
 * special case.
 */
static bool have_reloc_root(struct btrfs_root *root)
{
	if (reloc_root_is_dead(root))
		return false;
	if (!root->reloc_root)
		return false;
	return true;
}

int btrfs_should_ignore_reloc_root(struct btrfs_root *root)
{
	struct btrfs_root *reloc_root;

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
		return 0;

	/* This root has been merged with its reloc tree, we can ignore it */
	if (reloc_root_is_dead(root))
		return 1;

	reloc_root = root->reloc_root;
	if (!reloc_root)
		return 0;

	if (btrfs_header_generation(reloc_root->commit_root) ==
	    root->fs_info->running_transaction->transid)
		return 0;
	/*
	 * If there is a reloc tree and it was created in a previous
	 * transaction, backref lookup can find the reloc tree, so the
	 * backref node for the fs tree root is useless for relocation.
	 */
	return 1;
}
/*
 * find reloc tree by address of tree root
 */
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct btrfs_root *root = NULL;

	ASSERT(rc);
	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		root = node->data;
	}
	spin_unlock(&rc->reloc_root_tree.lock);
	return btrfs_grab_root(root);
}

/*
 * For useless nodes, do two major clean ups:
 *
 * - Cleanup the children edges and nodes
 *   If child node is also orphan (no parent) during cleanup, then the child
 *   node will also be cleaned up.
 *
 * - Freeing up leaves (level 0), keeps nodes detached
 *   For nodes, the node is still cached as "detached"
 *
 * Return false if @node is not in the @useless_nodes list.
 * Return true if @node is in the @useless_nodes list.
 */
static bool handle_useless_nodes(struct reloc_control *rc,
				 struct btrfs_backref_node *node)
{
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct list_head *useless_node = &cache->useless_node;
	bool ret = false;

	while (!list_empty(useless_node)) {
		struct btrfs_backref_node *cur;

		cur = list_first_entry(useless_node, struct btrfs_backref_node,
				 list);
		list_del_init(&cur->list);

		/* Only tree root nodes can be added to @useless_nodes */
		ASSERT(list_empty(&cur->upper));

		if (cur == node)
			ret = true;

		/* The node is the lowest node */
		if (cur->lowest) {
			list_del_init(&cur->lower);
			cur->lowest = 0;
		}

		/* Cleanup the lower edges */
		while (!list_empty(&cur->lower)) {
			struct btrfs_backref_edge *edge;
			struct btrfs_backref_node *lower;

			edge = list_entry(cur->lower.next,
					struct btrfs_backref_edge, list[UPPER]);
			list_del(&edge->list[UPPER]);
			list_del(&edge->list[LOWER]);
			lower = edge->node[LOWER];
			btrfs_backref_free_edge(cache, edge);

			/* Child node is also orphan, queue for cleanup */
			if (list_empty(&lower->upper))
				list_add(&lower->list, useless_node);
		}
		/* Mark this block processed for relocation */
		mark_block_processed(rc, cur);

		/*
		 * Backref nodes for tree leaves are deleted from the cache.
		 * Backref nodes for upper level tree blocks are left in the
		 * cache to avoid unnecessary backref lookup.
		 */
		if (cur->level > 0) {
			list_add(&cur->list, &cache->detached);
			cur->detached = 1;
		} else {
			rb_erase(&cur->rb_node, &cache->rb_root);
			btrfs_backref_free_node(cache, cur);
		}
	}
	return ret;
}

/*
 * Build a backref tree for a given tree block. The root of the backref tree
 * corresponds to the tree block, and the leaves of the backref tree
 * correspond to the roots of b-trees that reference the tree block.
 *
 * The basic idea of this function is to check backrefs of a given block to
 * find upper level blocks that reference the block, and then check backrefs
 * of these upper level blocks recursively. The recursion stops when a tree
 * root is reached or backrefs for the block are cached.
 *
 * NOTE: if we find that backrefs for a block are cached, we know backrefs
 * for all upper level blocks that directly/indirectly reference the block
 * are also cached.
 */
static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
			struct reloc_control *rc, struct btrfs_key *node_key,
			int level, u64 bytenr)
{
	struct btrfs_backref_iter *iter;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	/* For searching parent of TREE_BLOCK_REF */
	struct btrfs_path *path;
	struct btrfs_backref_node *cur;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_edge *edge;
	int ret;
	int err = 0;

	iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
	if (!iter)
		return ERR_PTR(-ENOMEM);
	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	node = btrfs_backref_alloc_node(cache, bytenr, level);
	if (!node) {
		err = -ENOMEM;
		goto out;
	}

	node->lowest = 1;
	cur = node;

	/* Breadth-first search to build backref cache */
	do {
		ret = btrfs_backref_add_tree_node(cache, path, iter, node_key,
						  cur);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		edge = list_first_entry_or_null(&cache->pending_edge,
				struct btrfs_backref_edge, list[UPPER]);
		/*
		 * The pending list isn't empty, take the first block to
		 * process
		 */
		if (edge) {
			list_del_init(&edge->list[UPPER]);
			cur = edge->node[UPPER];
		}
	} while (edge);

	/* Finish the upper linkage of newly added edges/nodes */
	ret = btrfs_backref_finish_upper_links(cache, node);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	if (handle_useless_nodes(rc, node))
		node = NULL;
out:
	btrfs_backref_iter_free(iter);
	btrfs_free_path(path);
	if (err) {
		btrfs_backref_error_cleanup(cache, node);
		return ERR_PTR(err);
	}
	ASSERT(!node || !node->detached);
	ASSERT(list_empty(&cache->useless_node) &&
	       list_empty(&cache->pending_edge));
	return node;
}

/*
 * helper to add a backref node for the newly created snapshot.
 * the backref node is created by cloning the backref node that
 * corresponds to the root of the source tree
 */
static int clone_backref_node(struct btrfs_trans_handle *trans,
			      struct reloc_control *rc,
			      struct btrfs_root *src,
			      struct btrfs_root *dest)
{
	struct btrfs_root *reloc_root = src->reloc_root;
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct btrfs_backref_node *node = NULL;
	struct btrfs_backref_node *new_node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *new_edge;
	struct rb_node *rb_node;

	if (cache->last_trans > 0)
		update_backref_cache(trans, cache);

	rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
		if (node->detached)
			node = NULL;
		else
			BUG_ON(node->new_bytenr != reloc_root->node->start);
	}

	if (!node) {
		rb_node = rb_simple_search(&cache->rb_root,
					   reloc_root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct btrfs_backref_node,
					rb_node);
			BUG_ON(node->detached);
		}
	}

	if (!node)
		return 0;

	new_node = btrfs_backref_alloc_node(cache, dest->node->start,
					    node->level);
	if (!new_node)
		return -ENOMEM;

	new_node->lowest = node->lowest;
	new_node->checked = 1;
	new_node->root = btrfs_grab_root(dest);
	ASSERT(new_node->root);

	if (!node->lowest) {
		list_for_each_entry(edge, &node->lower, list[UPPER]) {
			new_edge = btrfs_backref_alloc_edge(cache);
			if (!new_edge)
				goto fail;

			btrfs_backref_link_edge(new_edge, edge->node[LOWER],
						new_node, LINK_UPPER);
		}
	} else {
		list_add_tail(&new_node->lower, &cache->leaves);
	}

	rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
				   &new_node->rb_node);
	if (rb_node)
		btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);

	if (!new_node->lowest) {
		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
			list_add_tail(&new_edge->list[LOWER],
				      &new_edge->node[LOWER]->upper);
		}
	}
	return 0;
fail:
	while (!list_empty(&new_node->lower)) {
		new_edge = list_entry(new_node->lower.next,
				      struct btrfs_backref_edge, list[UPPER]);
		list_del(&new_edge->list[UPPER]);
		btrfs_backref_free_edge(cache, new_edge);
	}
	btrfs_backref_free_node(cache, new_node);
	return -ENOMEM;
}

/*
 * helper to add 'address of tree root -> reloc tree' mapping
 */
static int __must_check __add_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node;
	struct reloc_control *rc = fs_info->reloc_ctl;

	node = kmalloc(sizeof(*node), GFP_NOFS);
	if (!node)
		return -ENOMEM;

	node->bytenr = root->commit_root->start;
	node->data = root;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node) {
		btrfs_err(fs_info,
			    "Duplicate root found for start=%llu while inserting into relocation tree",
			    node->bytenr);
		return -EEXIST;
	}

	list_add_tail(&root->root_list, &rc->reloc_roots);
	return 0;
}

/*
 * helper to delete the 'address of tree root -> reloc tree'
 * mapping
 */
static void __del_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;
	bool put_ref = false;

	if (rc && root->node) {
		spin_lock(&rc->reloc_root_tree.lock);
		rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
					   root->commit_root->start);
		if (rb_node) {
			node = rb_entry(rb_node, struct mapping_node, rb_node);
			rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
			RB_CLEAR_NODE(&node->rb_node);
		}
		spin_unlock(&rc->reloc_root_tree.lock);
		ASSERT(!node || (struct btrfs_root *)node->data == root);
	}

	/*
	 * We only put the reloc root here if it's on the list.  There's a lot
	 * of places where the pattern is to splice the rc->reloc_roots, process
	 * the reloc roots, and then add the reloc root back onto
	 * rc->reloc_roots.  If we call __del_reloc_root while it's off of the
	 * list we don't want the reference being dropped, because the guy
	 * messing with the list is in charge of the reference.
	 */
	spin_lock(&fs_info->trans_lock);
	if (!list_empty(&root->root_list)) {
		put_ref = true;
		list_del_init(&root->root_list);
	}
	spin_unlock(&fs_info->trans_lock);
	if (put_ref)
		btrfs_put_root(root);
	kfree(node);
}

/*
 * helper to update the 'address of tree root -> reloc tree'
 * mapping
 */
static int __update_reloc_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *rb_node;
	struct mapping_node *node = NULL;
	struct reloc_control *rc = fs_info->reloc_ctl;

	spin_lock(&rc->reloc_root_tree.lock);
	rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
				   root->commit_root->start);
	if (rb_node) {
		node = rb_entry(rb_node, struct mapping_node, rb_node);
		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
	}
	spin_unlock(&rc->reloc_root_tree.lock);

	if (!node)
		return 0;
	BUG_ON((struct btrfs_root *)node->data != root);

	spin_lock(&rc->reloc_root_tree.lock);
	node->bytenr = root->node->start;
	rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
				   node->bytenr, &node->rb_node);
	spin_unlock(&rc->reloc_root_tree.lock);
	if (rb_node)
		btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
	return 0;
}

static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
					struct btrfs_root *root, u64 objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct extent_buffer *eb;
	struct btrfs_root_item *root_item;
	struct btrfs_key root_key;
	int ret = 0;
	bool must_abort = false;

	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
	if (!root_item)
		return ERR_PTR(-ENOMEM);

	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = objectid;

	if (root->root_key.objectid == objectid) {
		u64 commit_root_gen;

		/* called by btrfs_init_reloc_root */
		ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;

		/*
		 * Set the last_snapshot field to the generation of the commit
		 * root - like this ctree.c:btrfs_block_can_be_shared() behaves
		 * correctly (returns true) when the relocation root is created
		 * either inside the critical section of a transaction commit
		 * (through transaction.c:qgroup_account_snapshot()) and when
		 * it's created before the transaction commit is started.
		 */
		commit_root_gen = btrfs_header_generation(root->commit_root);
		btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
	} else {
		/*
		 * Called by btrfs_reloc_post_snapshot_hook.
		 * The source tree is a reloc tree, and all tree blocks
		 * modified after it was created have the RELOC flag set in
		 * their headers, so it's OK to not update 'last_snapshot'.
		 */
		ret = btrfs_copy_root(trans, root, root->node, &eb,
				      BTRFS_TREE_RELOC_OBJECTID);
		if (ret)
			goto fail;
	}

	/*
	 * We have changed references at this point, we must abort the
	 * transaction if anything fails.
	 */
	must_abort = true;

	memcpy(root_item, &root->root_item, sizeof(*root_item));
	btrfs_set_root_bytenr(root_item, eb->start);
	btrfs_set_root_level(root_item, btrfs_header_level(eb));
	btrfs_set_root_generation(root_item, trans->transid);

	if (root->root_key.objectid == objectid) {
		btrfs_set_root_refs(root_item, 0);
		memset(&root_item->drop_progress, 0,
		       sizeof(struct btrfs_disk_key));
		btrfs_set_root_drop_level(root_item, 0);
	}

	btrfs_tree_unlock(eb);
	free_extent_buffer(eb);

	ret = btrfs_insert_root(trans, fs_info->tree_root,
				&root_key, root_item);
	if (ret)
		goto fail;

	kfree(root_item);

	reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
	if (IS_ERR(reloc_root)) {
		ret = PTR_ERR(reloc_root);
		goto abort;
	}
	set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
	reloc_root->last_trans = trans->transid;
	return reloc_root;
fail:
	kfree(root_item);
abort:
	if (must_abort)
		btrfs_abort_transaction(trans, ret);
	return ERR_PTR(ret);
}

/*
 * create a reloc tree for a given fs tree. the reloc tree is just a
 * snapshot of the fs tree with a special root objectid.
 *
 * The reloc_root comes out of here with two references, one for
 * root->reloc_root, and another for being on the rc->reloc_roots list.
 */
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct reloc_control *rc = fs_info->reloc_ctl;
	struct btrfs_block_rsv *rsv;
	int clear_rsv = 0;
	int ret;

	if (!rc)
		return 0;

	/*
	 * The subvolume has reloc tree but the swap is finished, no need to
	 * create/update the dead reloc tree
	 */
	if (reloc_root_is_dead(root))
		return 0;

	/*
	 * This is subtle but important.  We do not do
	 * record_root_in_transaction for reloc roots, instead we record their
	 * corresponding fs root, and then here we update the last trans for the
	 * reloc root.  This means that we have to do this for the entire life
	 * of the reloc root, regardless of which stage of the relocation we are
	 * in.
	 */
	if (root->reloc_root) {
		reloc_root = root->reloc_root;
		reloc_root->last_trans = trans->transid;
		return 0;
	}

	/*
	 * We are merging reloc roots, we do not need new reloc trees.  Also
	 * reloc trees never need their own reloc tree.
	 */
	if (!rc->create_reloc_tree ||
	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		return 0;

	if (!trans->reloc_reserved) {
		rsv = trans->block_rsv;
		trans->block_rsv = rc->block_rsv;
		clear_rsv = 1;
	}
	reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
	if (clear_rsv)
		trans->block_rsv = rsv;
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	root->reloc_root = btrfs_grab_root(reloc_root);
	return 0;
}

/*
 * update root item of reloc tree
 */
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	int ret;

	if (!have_reloc_root(root))
		return 0;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	/*
	 * We are probably ok here, but __del_reloc_root() will drop its ref of
	 * the root.  We have the ref for root->reloc_root, but just in case
	 * hold it while we update the reloc root.
	 */
	btrfs_grab_root(reloc_root);

	/* root->reloc_root will stay until current relocation finished */
	if (fs_info->reloc_ctl->merge_reloc_tree &&
	    btrfs_root_refs(root_item) == 0) {
		set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
		/*
		 * Mark the tree as dead before we change reloc_root so
		 * have_reloc_root will not touch it from now on.
		 */
		smp_wmb();
		__del_reloc_root(reloc_root);
	}

	if (reloc_root->commit_root != reloc_root->node) {
		__update_reloc_root(reloc_root);
		btrfs_set_root_node(root_item, reloc_root->node);
		free_extent_buffer(reloc_root->commit_root);
		reloc_root->commit_root = btrfs_root_node(reloc_root);
	}

	ret = btrfs_update_root(trans, fs_info->tree_root,
				&reloc_root->root_key, root_item);
	btrfs_put_root(reloc_root);
	return ret;
}

/*
 * helper to find first cached inode with inode number >= objectid
 * in a subvolume
 */
static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
{
	struct rb_node *node;
	struct rb_node *prev;
	struct btrfs_inode *entry;
	struct inode *inode;

	spin_lock(&root->inode_lock);
again:
	node = root->inode_tree.rb_node;
	prev = NULL;
	while (node) {
		prev = node;
		entry = rb_entry(node, struct btrfs_inode, rb_node);

		if (objectid < btrfs_ino(entry))
			node = node->rb_left;
		else if (objectid > btrfs_ino(entry))
			node = node->rb_right;
		else
			break;
	}
	if (!node) {
		while (prev) {
			entry = rb_entry(prev, struct btrfs_inode, rb_node);
			if (objectid <= btrfs_ino(entry)) {
				node = prev;
				break;
			}
			prev = rb_next(prev);
		}
	}
	while (node) {
		entry = rb_entry(node, struct btrfs_inode, rb_node);
		inode = igrab(&entry->vfs_inode);
		if (inode) {
			spin_unlock(&root->inode_lock);
			return inode;
		}

		objectid = btrfs_ino(entry) + 1;
		if (cond_resched_lock(&root->inode_lock))
			goto again;

		node = rb_next(node);
	}
	spin_unlock(&root->inode_lock);
	return NULL;
}

/*
 * get new location of data
 */
static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
			    u64 bytenr, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

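	/*
	 * In the data reloc inode, index_cnt caches the start of the block
	 * group being relocated, so "bytenr - index_cnt" is the file offset
	 * of the relocated copy of this extent.
	 */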
	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
	ret = btrfs_lookup_file_extent(NULL, root, path,
			btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);

	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
	       btrfs_file_extent_compression(leaf, fi) ||
	       btrfs_file_extent_encryption(leaf, fi) ||
	       btrfs_file_extent_other_encoding(leaf, fi));

	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
		ret = -EINVAL;
		goto out;
	}

	*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * update file extent items in the tree leaf to point to
 * the new locations.
 */
static noinline_for_stack
int replace_file_extents(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_root *root,
			 struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct inode *inode = NULL;
	u64 parent;
	u64 bytenr;
	u64 new_bytenr = 0;
	u64 num_bytes;
	u64 end;
	u32 nritems;
	u32 i;
	int ret = 0;
	int first = 1;
	int dirty = 0;

	if (rc->stage != UPDATE_DATA_PTRS)
		return 0;

	/* reloc trees always use full backref */
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		parent = leaf->start;
	else
		parent = 0;

	nritems = btrfs_header_nritems(leaf);
	for (i = 0; i < nritems; i++) {
		struct btrfs_ref ref = { 0 };

		cond_resched();
		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		if (btrfs_file_extent_type(leaf, fi) ==
		    BTRFS_FILE_EXTENT_INLINE)
			continue;
		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
		if (bytenr == 0)
			continue;
		if (!in_range(bytenr, rc->block_group->start,
			      rc->block_group->length))
			continue;

		/*
		 * if we are modifying a block in the fs tree, wait for read_folio
		 * to complete and drop the extent cache
		 */
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
			if (first) {
				inode = find_next_inode(root, key.objectid);
				first = 0;
			} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
				btrfs_add_delayed_iput(inode);
				inode = find_next_inode(root, key.objectid);
			}
			if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
				struct extent_state *cached_state = NULL;

				end = key.offset +
				      btrfs_file_extent_num_bytes(leaf, fi);
				WARN_ON(!IS_ALIGNED(key.offset,
						    fs_info->sectorsize));
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
						      key.offset, end,
						      &cached_state);
				if (!ret)
					continue;

				btrfs_drop_extent_map_range(BTRFS_I(inode),
							    key.offset, end, true);
				unlock_extent(&BTRFS_I(inode)->io_tree,
					      key.offset, end, &cached_state);
			}
		}

		ret = get_new_location(rc->data_inode, &new_bytenr,
				       bytenr, num_bytes);
		if (ret) {
			/*
			 * Don't have to abort since we've not changed anything
			 * in the file extent yet.
			 */
			break;
		}

		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
		dirty = 1;

		key.offset -= btrfs_file_extent_offset(leaf, fi);
		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
				       num_bytes, parent);
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset,
				    root->root_key.objectid, false);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
				       num_bytes, parent);
		btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
				    key.objectid, key.offset,
				    root->root_key.objectid, false);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
	}
	if (dirty)
		btrfs_mark_buffer_dirty(leaf);
	if (inode)
		btrfs_add_delayed_iput(inode);
	return ret;
}

static noinline_for_stack
int memcmp_node_keys(struct extent_buffer *eb, int slot,
		     struct btrfs_path *path, int level)
{
	struct btrfs_disk_key key1;
	struct btrfs_disk_key key2;
	btrfs_node_key(eb, &key1, slot);
	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
	return memcmp(&key1, &key2, sizeof(key1));
}

/*
 * try to replace tree blocks in the fs tree with the new blocks
 * in the reloc tree. tree blocks that haven't been modified since
 * the reloc tree was created can be replaced.
 *
 * if a block was replaced, the level of the block + 1 is returned.
 * if no block got replaced, 0 is returned. if there are other
 * errors, a negative error number is returned.
 */
static noinline_for_stack
int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
		 struct btrfs_root *dest, struct btrfs_root *src,
		 struct btrfs_path *path, struct btrfs_key *next_key,
		 int lowest_level, int max_level)
{
	struct btrfs_fs_info *fs_info = dest->fs_info;
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	u64 old_bytenr;
	u64 new_bytenr;
	u64 old_ptr_gen;
	u64 new_ptr_gen;
	u64 last_snapshot;
	u32 blocksize;
	int cow = 0;
	int level;
	int ret;
	int slot;

	ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);

	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
	slot = path->slots[lowest_level];
	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);

	eb = btrfs_lock_root_node(dest);
	level = btrfs_header_level(eb);

	if (level < lowest_level) {
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
		return 0;
	}

	if (cow) {
		ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
			return ret;
		}
	}

	if (next_key) {
		next_key->objectid = (u64)-1;
		next_key->type = (u8)-1;
		next_key->offset = (u64)-1;
	}

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		ASSERT(level >= lowest_level);

		ret = btrfs_bin_search(parent, &key, &slot);
		if (ret < 0)
			break;
		if (ret && slot > 0)
			slot--;

		if (next_key && slot + 1 < btrfs_header_nritems(parent))
			btrfs_node_key_to_cpu(parent, next_key, slot + 1);

		old_bytenr = btrfs_node_blockptr(parent, slot);
		blocksize = fs_info->nodesize;
		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);

		if (level <= max_level) {
			eb = path->nodes[level];
			new_bytenr = btrfs_node_blockptr(eb,
							path->slots[level]);
			new_ptr_gen = btrfs_node_ptr_generation(eb,
							path->slots[level]);
		} else {
			new_bytenr = 0;
			new_ptr_gen = 0;
		}

		if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
			ret = level;
			break;
		}

		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
		    memcmp_node_keys(parent, slot, path, level)) {
			if (level <= lowest_level) {
				ret = 0;
				break;
			}

			eb = btrfs_read_node_slot(parent, slot);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				break;
			}
			btrfs_tree_lock(eb);
			if (cow) {
				ret = btrfs_cow_block(trans, dest, eb, parent,
						      slot, &eb,
						      BTRFS_NESTING_COW);
				if (ret) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
					break;
				}
			}

			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);

			parent = eb;
			continue;
		}

		if (!cow) {
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			cow = 1;
			goto again;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &key,
				      path->slots[level]);
		btrfs_release_path(path);

		path->lowest_level = level;
		set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
		clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
		path->lowest_level = 0;
		if (ret) {
			if (ret > 0)
				ret = -ENOENT;
			break;
		}

		/*
		 * Info qgroup to trace both subtrees.
		 *
		 * We must trace both trees.
		 * 1) Tree reloc subtree
		 *    If not traced, we will leak data numbers
		 * 2) Fs subtree
		 *    If not traced, we will double count old data
		 *
		 * We don't scan the subtree right now, but only record
		 * the swapped tree blocks.
		 * The real subtree rescan is delayed until we have new
		 * CoW on the subtree root node before transaction commit.
		 */
		ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
				rc->block_group, parent, slot,
				path->nodes[level], path->slots[level],
				last_snapshot);
		if (ret < 0)
			break;
		/*
		 * swap blocks in fs tree and reloc tree.
		 */
		btrfs_set_node_blockptr(parent, slot, new_bytenr);
		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
		btrfs_mark_buffer_dirty(parent);

		btrfs_set_node_blockptr(path->nodes[level],
					path->slots[level], old_bytenr);
		btrfs_set_node_ptr_generation(path->nodes[level],
					      path->slots[level], old_ptr_gen);
		btrfs_mark_buffer_dirty(path->nodes[level]);

		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
				       blocksize, path->nodes[level]->start);
		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
				    0, true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}
		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
				       blocksize, 0);
		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
				    true);
		ret = btrfs_inc_extent_ref(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
				       blocksize, path->nodes[level]->start);
		btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
				    0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
				       blocksize, 0);
		btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
				    0, true);
		ret = btrfs_free_extent(trans, &ref);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			break;
		}

		btrfs_unlock_up_safe(path, 0);

		ret = level;
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return ret;
}

/*
 * helper to find next relocated block in reloc tree
 */
static noinline_for_stack
int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
		       int *level)
{
	struct extent_buffer *eb;
	int i;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = 0; i < *level; i++) {
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}

	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] + 1 < nritems) {
			path->slots[i]++;
			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
			    last_snapshot)
				continue;

			*level = i;
			return 0;
		}
		free_extent_buffer(path->nodes[i]);
		path->nodes[i] = NULL;
	}
	return 1;
}

/*
 * walk down reloc tree to find relocated block of lowest level
 */
static noinline_for_stack
int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
			 int *level)
{
	struct extent_buffer *eb = NULL;
	int i;
	u64 ptr_gen = 0;
	u64 last_snapshot;
	u32 nritems;

	last_snapshot = btrfs_root_last_snapshot(&root->root_item);

	for (i = *level; i > 0; i--) {
		eb = path->nodes[i];
		nritems = btrfs_header_nritems(eb);
		while (path->slots[i] < nritems) {
			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
			if (ptr_gen > last_snapshot)
				break;
			path->slots[i]++;
		}
		if (path->slots[i] >= nritems) {
			if (i == *level)
				break;
			*level = i + 1;
			return 0;
		}
		if (i == 1) {
			*level = i;
			return 0;
		}

		eb = btrfs_read_node_slot(eb, path->slots[i]);
		if (IS_ERR(eb))
			return PTR_ERR(eb);
		BUG_ON(btrfs_header_level(eb) != i - 1);
		path->nodes[i - 1] = eb;
		path->slots[i - 1] = 0;
	}
	return 1;
}

/*
 * invalidate extent cache for file extents whose key is in the range
 * [min_key, max_key)
 */
static int invalidate_extent_cache(struct btrfs_root *root,
				   struct btrfs_key *min_key,
				   struct btrfs_key *max_key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode = NULL;
	u64 objectid;
	u64 start, end;
	u64 ino;

	objectid = min_key->objectid;
	while (1) {
		struct extent_state *cached_state = NULL;

		cond_resched();
		iput(inode);

		if (objectid > max_key->objectid)
			break;

		inode = find_next_inode(root, objectid);
		if (!inode)
			break;
		ino = btrfs_ino(BTRFS_I(inode));

		if (ino > max_key->objectid) {
			iput(inode);
			break;
		}

		objectid = ino + 1;
		if (!S_ISREG(inode->i_mode))
			continue;

		if (unlikely(min_key->objectid == ino)) {
			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
				continue;
			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
				start = 0;
			else {
				start = min_key->offset;
				WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
			}
		} else {
			start = 0;
		}

		if (unlikely(max_key->objectid == ino)) {
			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
				continue;
			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
				end = (u64)-1;
			} else {
				if (max_key->offset == 0)
					continue;
				end = max_key->offset;
				WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
				end--;
			}
		} else {
			end = (u64)-1;
		}

		/* the lock_extent waits for read_folio to complete */
		lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
		btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true);
		unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
	}
	return 0;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)

{
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
			return 0;
		}
		level++;
	}
	return 1;
}

/*
 * Insert current subvolume into reloc_control::dirty_subvol_roots
 */
static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
			       struct reloc_control *rc,
			       struct btrfs_root *root)
{
	struct btrfs_root *reloc_root = root->reloc_root;
	struct btrfs_root_item *reloc_root_item;
	int ret;

	/* @root must be a subvolume tree root with a valid reloc tree */
	ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
	ASSERT(reloc_root);

	reloc_root_item = &reloc_root->root_item;
	memset(&reloc_root_item->drop_progress, 0,
		sizeof(reloc_root_item->drop_progress));
	btrfs_set_root_drop_level(reloc_root_item, 0);
	btrfs_set_root_refs(reloc_root_item, 0);
	ret = btrfs_update_reloc_root(trans, root);
	if (ret)
		return ret;

	if (list_empty(&root->reloc_dirty_list)) {
		btrfs_grab_root(root);
		list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
	}

	return 0;
}
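
/*
 * Drop the trees queued on rc->dirty_subvol_roots: merged subvolumes have
 * their (now dead) reloc root dropped, while orphan reloc trees are
 * dropped directly.
 */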

static int clean_dirty_subvols(struct reloc_control *rc)
{
	struct btrfs_root *root;
	struct btrfs_root *next;
	int ret = 0;
	int ret2;

	list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
				 reloc_dirty_list) {
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
			/* Merged subvolume, cleanup its reloc root */
			struct btrfs_root *reloc_root = root->reloc_root;
			list_del_init(&root->reloc_dirty_list);
			root->reloc_root = NULL;
			/*
			 * Need barrier to ensure clear_bit() only happens after
			 * root->reloc_root = NULL. Pairs with have_reloc_root.
			 */
			smp_wmb();
			clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
			if (reloc_root) {
				/*
				 * btrfs_drop_snapshot drops our ref we hold for
				 * ->reloc_root.  If it fails however we must
				 * drop the ref ourselves.
				 */
				ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
				if (ret2 < 0) {
					btrfs_put_root(reloc_root);
					if (!ret)
						ret = ret2;
				}
			}
			btrfs_put_root(root);
		} else {
			/* Orphan reloc tree, just clean it up */
			ret2 = btrfs_drop_snapshot(root, 0, 1);
			if (ret2 < 0) {
				btrfs_put_root(root);
				if (!ret)
					ret = ret2;
			}
		}
	}
	return ret;
}

/*
 * merge the relocated tree blocks in the reloc tree with the
 * corresponding fs tree.
 */
static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
					       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_key key;
	struct btrfs_key next_key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_root *reloc_root;
	struct btrfs_root_item *root_item;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int reserve_level;
	int level;
	int max_level;
	int replaced = 0;
	int ret = 0;
	u32 min_reserved;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	reloc_root = root->reloc_root;
	root_item = &reloc_root->root_item;

	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_root_level(root_item);
		atomic_inc(&reloc_root->node->refs);
		path->nodes[level] = reloc_root->node;
		path->slots[level] = 0;
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			btrfs_free_path(path);
			return ret;
		}

		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
				      path->slots[level]);
		WARN_ON(memcmp(&key, &next_key, sizeof(key)));

		btrfs_unlock_up_safe(path, 0);
	}

	/*
	 * In merge_reloc_root(), we modify the upper level pointer to swap the
	 * tree blocks between reloc tree and subvolume tree.  Thus for tree
	 * block COW, we COW at most from level 1 to root level for each tree.
	 *
	 * Thus the needed metadata size is at most root_level * nodesize,
	 * and * 2 since we have two trees to COW.
	 */
	reserve_level = max_t(int, 1, btrfs_root_level(root_item));
	min_reserved = fs_info->nodesize * reserve_level * 2;
	memset(&next_key, 0, sizeof(next_key));

	while (1) {
		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
					     min_reserved,
					     BTRFS_RESERVE_FLUSH_LIMIT);
		if (ret)
			goto out;
		trans = btrfs_start_transaction(root, 0);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			trans = NULL;
			goto out;
		}

		/*
		 * At this point we no longer have a reloc_control, so we can't
		 * depend on btrfs_init_reloc_root to update our last_trans.
		 *
		 * But that's ok, we started the trans handle on our
		 * corresponding fs_root, which means it's been added to the
		 * dirty list.  At commit time we'll still call
		 * btrfs_update_reloc_root() and update our root item
		 * appropriately.
		 */
		reloc_root->last_trans = trans->transid;
		trans->block_rsv = rc->block_rsv;

		replaced = 0;
		max_level = level;

		ret = walk_down_reloc_tree(reloc_root, path, &level);
		if (ret < 0)
			goto out;
		if (ret > 0)
			break;

		if (!find_next_key(path, level, &key) &&
		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
			ret = 0;
		} else {
			ret = replace_path(trans, rc, root, reloc_root, path,
					   &next_key, level, max_level);
		}
		if (ret < 0)
			goto out;
		if (ret > 0) {
			level = ret;
			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			replaced = 1;
		}

		ret = walk_up_reloc_tree(reloc_root, path, &level);
		if (ret > 0)
			break;

		BUG_ON(level == 0);
		/*
		 * save the merging progress in the drop_progress.
		 * this is OK since root refs == 1 in this case.
		 */
		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
			       path->slots[level]);
		btrfs_set_root_drop_level(root_item, level);

		btrfs_end_transaction_throttle(trans);
		trans = NULL;

		btrfs_btree_balance_dirty(fs_info);

		if (replaced && rc->stage == UPDATE_DATA_PTRS)
			invalidate_extent_cache(root, &key, &next_key);
	}

	/*
	 * handle the case only one block in the fs tree need to be
	 * relocated and the block is tree root.
	 */
	leaf = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
			      BTRFS_NESTING_COW);
	btrfs_tree_unlock(leaf);
	free_extent_buffer(leaf);
out:
	btrfs_free_path(path);

	if (ret == 0) {
		ret = insert_dirty_subvol(trans, rc, root);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	if (trans)
		btrfs_end_transaction_throttle(trans);

	btrfs_btree_balance_dirty(fs_info);

	if (replaced && rc->stage == UPDATE_DATA_PTRS)
		invalidate_extent_cache(root, &key, &next_key);

	return ret;
}

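/*
 * Prepare for merging reloc trees: reserve metadata for the merge, set the
 * reference count of each reloc root to 1 so btrfs_recover_relocation()
 * knows to resume merging after a crash, and commit the transaction.
 */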
static noinline_for_stack
int prepare_to_merge(struct reloc_control *rc, int err)
{
	struct btrfs_root *root = rc->extent_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *reloc_root;
	struct btrfs_trans_handle *trans;
	LIST_HEAD(reloc_roots);
	u64 num_bytes = 0;
	int ret;

1867 1868
	mutex_lock(&fs_info->reloc_mutex);
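	/*
	 * Pad the reservation for merging: room to COW two full trees
	 * (BTRFS_MAX_LEVEL - 1 nodes each, hence the * 2), plus twice the
	 * bytes of the nodes we have already relocated.
	 */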
	rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
	rc->merging_rsv_size += rc->nodes_relocated * 2;
	mutex_unlock(&fs_info->reloc_mutex);

again:
	if (!err) {
		num_bytes = rc->merging_rsv_size;
		ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
					  BTRFS_RESERVE_FLUSH_ALL);
		if (ret)
			err = ret;
	}

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		if (!err)
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
		return PTR_ERR(trans);
	}

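	/*
	 * Another thread may have grown merging_rsv_size while we were
	 * joining the transaction; release what we reserved and retry with
	 * the new size.
	 */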
	if (!err) {
		if (num_bytes != rc->merging_rsv_size) {
			btrfs_end_transaction(trans);
			btrfs_block_rsv_release(fs_info, rc->block_rsv,
						num_bytes, NULL);
			goto again;
		}
	}

	rc->merge_reloc_tree = 1;

	while (!list_empty(&rc->reloc_roots)) {
		reloc_root = list_entry(rc->reloc_roots.next,
					struct btrfs_root, root_list);
		list_del_init(&reloc_root->root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
				false);
		if (IS_ERR(root)) {
			/*
			 * Even if we have an error we need this reloc root
			 * back on our list so we can clean up properly.
			 */
			list_add(&reloc_root->root_list, &reloc_roots);
			btrfs_abort_transaction(trans, (int)PTR_ERR(root));
			if (!err)
				err = PTR_ERR(root);
			break;
		}
		ASSERT(root->reloc_root == reloc_root);

		/*
		 * Set reference count to 1, so btrfs_recover_relocation
		 * knows it should resume merging.
		 */
		if (!err)
			btrfs_set_root_refs(&reloc_root->root_item, 1);
		ret = btrfs_update_reloc_root(trans, root);

		/*
		 * Even if we have an error we need this reloc root back on our
		 * list so we can clean up properly.
		 */
		list_add(&reloc_root->root_list, &reloc_roots);
		btrfs_put_root(root);

		if (ret) {
			btrfs_abort_transaction(trans, ret);
			if (!err)
				err = ret;
			break;
		}
	}

	list_splice(&reloc_roots, &rc->reloc_roots);

	if (!err)
		err = btrfs_commit_transaction(trans);
	else
		btrfs_end_transaction(trans);
	return err;
}

static noinline_for_stack
void free_reloc_roots(struct list_head *list)
{
	struct btrfs_root *reloc_root, *tmp;

	list_for_each_entry_safe(reloc_root, tmp, list, root_list)
		__del_reloc_root(reloc_root);
}

static noinline_for_stack
void merge_reloc_roots(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_root *root;
	struct btrfs_root *reloc_root;
	LIST_HEAD(reloc_roots);
	int found = 0;
	int ret = 0;
again:
	root = rc->extent_root;

	/*
	 * This serializes us with btrfs_record_root_in_transaction(); we have
	 * to make sure nobody is in the middle of adding their roots to the
	 * list while we are doing this splice.
	 */
	mutex_lock(&fs_info->reloc_mutex);
	list_splice_init(&rc->reloc_roots, &reloc_roots);
	mutex_unlock(&fs_info->reloc_mutex);

	while (!list_empty(&reloc_roots)) {
		found = 1;
		reloc_root = list_entry(reloc_roots.next,
					struct btrfs_root, root_list);

		root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					 false);
		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			if (IS_ERR(root)) {
				/*
				 * For recovery we read the fs roots on mount,
				 * and if we didn't find the root then we marked
				 * the reloc root as a garbage root.  For normal
				 * relocation obviously the root should exist in
				 * memory.  However there's no reason we can't
				 * handle the error properly here just in case.
				 */
				ASSERT(0);
				ret = PTR_ERR(root);
				goto out;
			}
			if (root->reloc_root != reloc_root) {
				/*
				 * This is actually impossible without something
				 * going really wrong (like weird race condition
				 * or cosmic rays).
				 */
				ASSERT(0);
				ret = -EINVAL;
				goto out;
			}
			ret = merge_reloc_root(rc, root);
			btrfs_put_root(root);
			if (ret) {
				if (list_empty(&reloc_root->root_list))
					list_add_tail(&reloc_root->root_list,
						      &reloc_roots);
				goto out;
			}
		} else {
			if (!IS_ERR(root)) {
				if (root->reloc_root == reloc_root) {
					root->reloc_root = NULL;
					btrfs_put_root(reloc_root);
				}
				clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
					  &root->state);
				btrfs_put_root(root);
			}

			list_del_init(&reloc_root->root_list);
			/* Don't forget to queue this reloc root for cleanup */
			list_add_tail(&reloc_root->reloc_dirty_list,
				      &rc->dirty_subvol_roots);
		}
	}

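	/*
	 * More reloc roots may have been queued on rc->reloc_roots while we
	 * were merging; loop back and splice again until a pass finds the
	 * list empty.
	 */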
	if (found) {
		found = 0;
		goto again;
	}
out:
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret, NULL);
		free_reloc_roots(&reloc_roots);

		/* new reloc root may be added */
		mutex_lock(&fs_info->reloc_mutex);
		list_splice_init(&rc->reloc_roots, &reloc_roots);
		mutex_unlock(&fs_info->reloc_mutex);
		free_reloc_roots(&reloc_roots);
	}

	/*
	 * We used to have
	 *
	 * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
	 *
	 * here, but it's wrong.  If we fail to start the transaction in
	 * prepare_to_merge() we will have only 0 ref reloc roots, none of which
	 * have actually been removed from the reloc_root_tree rb tree.  This is
	 * fine because we're bailing here, and we hold a reference on the root
	 * for the list that holds it, so these roots will be cleaned up when we
	 * do the reloc_dirty_list afterwards.  Meanwhile the root->reloc_root
	 * will be cleaned up on unmount.
	 *
	 * The remaining nodes will be cleaned up by free_reloc_control.
	 */
}

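/*
 * Free all tree_block entries queued on an rb_root of blocks to process.
 */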
static void free_block_list(struct rb_root *blocks)
{
	struct tree_block *block;
	struct rb_node *rb_node;
	while ((rb_node = rb_first(blocks))) {
		block = rb_entry(rb_node, struct tree_block, rb_node);
		rb_erase(rb_node, blocks);
		kfree(block);
	}
}

static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
				      struct btrfs_root *reloc_root)
{
	struct btrfs_fs_info *fs_info = reloc_root->fs_info;
	struct btrfs_root *root;
	int ret;

	if (reloc_root->last_trans == trans->transid)
		return 0;

	root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);

	/*
	 * This should succeed, since we can't have a reloc root without having
	 * already looked up the actual root and created the reloc root for this
	 * root.
	 *
	 * However if there's some sort of corruption where we have a ref to a
	 * reloc root without a corresponding root this could return ENOENT.
	 */
	if (IS_ERR(root)) {
		ASSERT(0);
		return PTR_ERR(root);
	}
	if (root->reloc_root != reloc_root) {
		ASSERT(0);
		btrfs_err(fs_info,
			  "root %llu has two reloc roots associated with it",
			  reloc_root->root_key.offset);
		btrfs_put_root(root);
		return -EUCLEAN;
	}
	ret = btrfs_record_root_in_trans(trans, root);
	btrfs_put_root(root);

	return ret;
}

static noinline_for_stack
struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
				     struct reloc_control *rc,
				     struct btrfs_backref_node *node,
				     struct btrfs_backref_edge *edges[])
{
	struct btrfs_backref_node *next;
	struct btrfs_root *root;
	int index = 0;
	int ret;

	next = node;
	while (1) {
		cond_resched();
		next = walk_up_backref(next, edges, &index);
		root = next->root;

		/*
		 * If there is no root, then our references for this block are
		 * incomplete, as we should be able to walk all the way up to a
		 * block that is owned by a root.
		 *
		 * This path is only for SHAREABLE roots, so if we come upon a
		 * non-SHAREABLE root then we have backrefs that resolve
		 * improperly.
		 *
		 * Both of these cases indicate file system corruption, or a bug
		 * in the backref walking code.
		 */
		if (!root) {
			ASSERT(0);
			btrfs_err(trans->fs_info,
		"bytenr %llu doesn't have a backref path ending in a root",
				  node->bytenr);
			return ERR_PTR(-EUCLEAN);
		}
		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
			ASSERT(0);
			btrfs_err(trans->fs_info,
	"bytenr %llu has multiple refs with one ending in a non-shareable root",
				  node->bytenr);
			return ERR_PTR(-EUCLEAN);
		}

		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
			ret = record_reloc_root_in_trans(trans, root);
			if (ret)
				return ERR_PTR(ret);
			break;
		}

		ret = btrfs_record_root_in_trans(trans, root);
		if (ret)
			return ERR_PTR(ret);
		root = root->reloc_root;

		/*
		 * We could have raced with another thread which failed, so
		 * root->reloc_root may not be set, return ENOENT in this case.
		 */
		if (!root)
			return ERR_PTR(-ENOENT);

		if (next->new_bytenr != root->node->start) {
			/*
			 * We just created the reloc root, so we shouldn't have
			 * ->new_bytenr set and this shouldn't be in the changed
			 *  list.  If it is then we have multiple roots pointing
			 *  at the same bytenr which indicates corruption, or
			 *  we've made a mistake in the backref walking code.
			 */
			ASSERT(next->new_bytenr == 0);
			ASSERT(list_empty(&next->list));
			if (next->new_bytenr || !list_empty(&next->list)) {
				btrfs_err(trans->fs_info,
	"bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
					  node->bytenr, next->bytenr);
				return ERR_PTR(-EUCLEAN);
			}

			next->new_bytenr = root->node->start;
			btrfs_put_root(next->root);
			next->root = btrfs_grab_root(root);
			ASSERT(next->root);
			list_add_tail(&next->list,
				      &rc->backref_cache.changed);
			mark_block_processed(rc, next);
			break;
		}

		WARN_ON(1);
		root = NULL;
		next = walk_down_backref(edges, &index);
		if (!next || next->level <= node->level)
			break;
	}
	if (!root) {
		/*
		 * This can happen if there's fs corruption or if there's a bug
		 * in the backref lookup code.
		 */
		ASSERT(0);
		return ERR_PTR(-ENOENT);
	}

	next = node;
	/* setup backref node path for btrfs_reloc_cow_block */
	while (1) {
		rc->backref_cache.path[next->level] = next;
		if (--index < 0)
			break;
		next = edges[index]->node[UPPER];
	}
	return root;
}

/*
 * Select a tree root for relocation.
 *
 * Return NULL if the block is not shareable. We should use do_relocation() in
 * this case.
 *
 * Return a tree root pointer if the block is shareable.
 * Return -ENOENT if the block is the root of a reloc tree.
 */
static noinline_for_stack
struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
{
	struct btrfs_backref_node *next;
	struct btrfs_root *root;
	struct btrfs_root *fs_root = NULL;
	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	int index = 0;

	next = node;
	while (1) {
		cond_resched();
		next = walk_up_backref(next, edges, &index);
		root = next->root;

		/*
		 * This can occur if we have incomplete extent refs leading all
		 * the way up a particular path, in this case return -EUCLEAN.
		 */
		if (!root)
			return ERR_PTR(-EUCLEAN);

		/* No other choice for non-shareable tree */
		if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
			return root;

		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
			fs_root = root;

		if (next != node)
			return NULL;

		next = walk_down_backref(edges, &index);
		if (!next || next->level <= node->level)
			break;
	}

	if (!fs_root)
		return ERR_PTR(-ENOENT);
	return fs_root;
}

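/*
 * Walk all backref paths from @node upward, counting one nodesize of metadata
 * for every block that has not been processed yet.  This gives the worst-case
 * amount of metadata needed to relocate the blocks reachable from @node.
 */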
static noinline_for_stack
u64 calcu_metadata_size(struct reloc_control *rc,
			struct btrfs_backref_node *node, int reserve)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_backref_node *next = node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	u64 num_bytes = 0;
	int index = 0;

	BUG_ON(reserve && node->processed);

	while (next) {
		cond_resched();
		while (1) {
			if (next->processed && (reserve || next != node))
				break;

			num_bytes += fs_info->nodesize;

			if (list_empty(&next->upper))
				break;

			edge = list_entry(next->upper.next,
					struct btrfs_backref_edge, list[LOWER]);
			edges[index++] = edge;
			next = edge->node[UPPER];
		}
		next = walk_down_backref(edges, &index);
	}
	return num_bytes;
}

static int reserve_metadata_space(struct btrfs_trans_handle *trans,
				  struct reloc_control *rc,
				  struct btrfs_backref_node *node)
{
	struct btrfs_root *root = rc->extent_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 num_bytes;
	int ret;
	u64 tmp;

	num_bytes = calcu_metadata_size(rc, node, 1) * 2;

	trans->block_rsv = rc->block_rsv;
	rc->reserved_bytes += num_bytes;

	/*
	 * We are under a transaction here so we can only do limited flushing.
	 * If we get an enospc just kick back -EAGAIN so we know to drop the
	 * transaction and try to refill when we can flush all the things.
	 */
	ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
				     BTRFS_RESERVE_FLUSH_LIMIT);
	if (ret) {
		tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
		while (tmp <= rc->reserved_bytes)
			tmp <<= 1;
		/*
		 * Only one thread can access block_rsv at this point, so we
		 * don't need to hold a lock to protect it.  We expand the
		 * reservation size here to allow enough space for relocation,
		 * and we will return early in the ENOSPC case.
		 */
		rc->block_rsv->size = tmp + fs_info->nodesize *
				      RELOCATION_RESERVED_NODES;
		return -EAGAIN;
	}

	return 0;
}

/*
 * Relocate a tree block, and then update pointers in upper level blocks that
 * reference the block to point to the new location.
 *
 * If called by link_to_upper, the block has already been relocated; in that
 * case this function just updates pointers.
 */
static int do_relocation(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_backref_node *node,
			 struct btrfs_key *key,
			 struct btrfs_path *path, int lowest)
{
	struct btrfs_backref_node *upper;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	struct btrfs_root *root;
	struct extent_buffer *eb;
	u32 blocksize;
	u64 bytenr;
	int slot;
	int ret = 0;

	/*
	 * If we are lowest then this is the first time we're processing this
	 * block, and thus shouldn't have an eb associated with it yet.
	 */
	ASSERT(!lowest || !node->eb);

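	/*
	 * For each edge to an upper level block, make the upper block point
	 * at the relocated copy: COW the child block on the first pass
	 * (lowest), or rewrite the block pointer in place afterwards.
	 */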
	path->lowest_level = node->level + 1;
	rc->backref_cache.path[node->level] = node;
	list_for_each_entry(edge, &node->upper, list[LOWER]) {
		struct btrfs_ref ref = { 0 };

		cond_resched();

		upper = edge->node[UPPER];
		root = select_reloc_root(trans, rc, upper, edges);
		if (IS_ERR(root)) {
			ret = PTR_ERR(root);
			goto next;
		}

		if (upper->eb && !upper->locked) {
			if (!lowest) {
				ret = btrfs_bin_search(upper->eb, key, &slot);
				if (ret < 0)
					goto next;
				BUG_ON(ret);
				bytenr = btrfs_node_blockptr(upper->eb, slot);
				if (node->eb->start == bytenr)
					goto next;
			}
			btrfs_backref_drop_node_buffer(upper);
		}

		if (!upper->eb) {
			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
			if (ret) {
				if (ret > 0)
					ret = -ENOENT;

				btrfs_release_path(path);
				break;
			}

			if (!upper->eb) {
				upper->eb = path->nodes[upper->level];
				path->nodes[upper->level] = NULL;
			} else {
				BUG_ON(upper->eb != path->nodes[upper->level]);
			}

			upper->locked = 1;
			path->locks[upper->level] = 0;

			slot = path->slots[upper->level];
			btrfs_release_path(path);
		} else {
			ret = btrfs_bin_search(upper->eb, key, &slot);
			if (ret < 0)
				goto next;
			BUG_ON(ret);
		}

		bytenr = btrfs_node_blockptr(upper->eb, slot);
		if (lowest) {
			if (bytenr != node->bytenr) {
				btrfs_err(root->fs_info,
		"lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
					  bytenr, node->bytenr, slot,
					  upper->eb->start);
				ret = -EIO;
				goto next;
			}
		} else {
			if (node->eb->start == bytenr)
				goto next;
		}

		blocksize = root->fs_info->nodesize;
		eb = btrfs_read_node_slot(upper->eb, slot);
		if (IS_ERR(eb)) {
			ret = PTR_ERR(eb);
			goto next;
		}
		btrfs_tree_lock(eb);

		if (!node->eb) {
			ret = btrfs_cow_block(trans, root, eb, upper->eb,
					      slot, &eb, BTRFS_NESTING_COW);
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
			if (ret < 0)
				goto next;
			/*
			 * We've just COWed this block, it should have updated
			 * the correct backref node entry.
			 */
			ASSERT(node->eb == eb);
		} else {
			btrfs_set_node_blockptr(upper->eb, slot,
						node->eb->start);
			btrfs_set_node_ptr_generation(upper->eb, slot,
						      trans->transid);
			btrfs_mark_buffer_dirty(upper->eb);

			btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
					       node->eb->start, blocksize,
					       upper->eb->start);
			btrfs_init_tree_ref(&ref, node->level,
					    btrfs_header_owner(upper->eb),
					    root->root_key.objectid, false);
			ret = btrfs_inc_extent_ref(trans, &ref);
			if (!ret)
				ret = btrfs_drop_subtree(trans, root, eb,
							 upper->eb);
			if (ret)
				btrfs_abort_transaction(trans, ret);
		}
next:
		if (!upper->pending)
			btrfs_backref_drop_node_buffer(upper);
		else
			btrfs_backref_unlock_node_buffer(upper);
		if (ret)
			break;
	}

	if (!ret && node->pending) {
		btrfs_backref_drop_node_buffer(node);
		list_move_tail(&node->list, &rc->backref_cache.changed);
		node->pending = 0;
	}

	path->lowest_level = 0;

	/*
	 * We should have allocated all of our space in the block rsv and thus
	 * shouldn't ENOSPC.
	 */
	ASSERT(ret != -ENOSPC);
	return ret;
}

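/*
 * The block has already been relocated; just update the pointers in the
 * upper level blocks that still reference it.
 */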
static int link_to_upper(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc,
			 struct btrfs_backref_node *node,
			 struct btrfs_path *path)
{
	struct btrfs_key key;

	btrfs_node_key_to_cpu(node->eb, &key, 0);
	return do_relocation(trans, rc, node, &key, path, 0);
}

static int finish_pending_nodes(struct btrfs_trans_handle *trans,
				struct reloc_control *rc,
				struct btrfs_path *path, int err)
{
	LIST_HEAD(list);
	struct btrfs_backref_cache *cache = &rc->backref_cache;
	struct btrfs_backref_node *node;
	int level;
	int ret;

	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
		while (!list_empty(&cache->pending[level])) {
			node = list_entry(cache->pending[level].next,
					  struct btrfs_backref_node, list);
			list_move_tail(&node->list, &list);
			BUG_ON(!node->pending);

			if (!err) {
				ret = link_to_upper(trans, rc, node, path);
				if (ret < 0)
					err = ret;
			}
		}
		list_splice_init(&list, &cache->pending[level]);
	}
	return err;
}

/*
 * mark a block and all blocks directly/indirectly reference the block
 * as processed.
 */
static void update_processed_blocks(struct reloc_control *rc,
				    struct btrfs_backref_node *node)
{
	struct btrfs_backref_node *next = node;
	struct btrfs_backref_edge *edge;
	struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
	int index = 0;

	while (next) {
		cond_resched();
		while (1) {
			if (next->processed)
				break;

			mark_block_processed(rc, next);

			if (list_empty(&next->upper))
				break;

			edge = list_entry(next->upper.next,
					struct btrfs_backref_edge, list[LOWER]);
			edges[index++] = edge;
			next = edge->node[UPPER];
		}
		next = walk_down_backref(edges, &index);
	}
}

static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
{
	u32 blocksize = rc->extent_root->fs_info->nodesize;

	if (test_range_bit(&rc->processed_blocks, bytenr,
			   bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
		return 1;
	return 0;
}

static int get_tree_block_key(struct btrfs_fs_info *fs_info,
			      struct tree_block *block)
{
	struct extent_buffer *eb;

	eb = read_tree_block(fs_info, block->bytenr, block->owner,
			     block->key.offset, block->level, NULL);
	if (IS_ERR(eb))
		return PTR_ERR(eb);
	if (!extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		return -EIO;
	}
	if (block->level == 0)
		btrfs_item_key_to_cpu(eb, &block->key, 0);
	else
		btrfs_node_key_to_cpu(eb, &block->key, 0);
	free_extent_buffer(eb);
	block->key_ready = 1;
	return 0;
}

/*
 * helper function to relocate a tree block
 */
static int relocate_tree_block(struct btrfs_trans_handle *trans,
				struct reloc_control *rc,
				struct btrfs_backref_node *node,
				struct btrfs_key *key,
				struct btrfs_path *path)
{
	struct btrfs_root *root;
	int ret = 0;

	if (!node)
		return 0;

	/*
	 * If we fail here we want to drop our backref_node because we are going
	 * to start over and regenerate the tree for it.
	 */
	ret = reserve_metadata_space(trans, rc, node);
	if (ret)
		goto out;

	BUG_ON(node->processed);
	root = select_one_root(node);
	if (IS_ERR(root)) {
		ret = PTR_ERR(root);

		/* See explanation in select_one_root for the -EUCLEAN case. */
		ASSERT(ret == -ENOENT);
		if (ret == -ENOENT) {
			ret = 0;
			update_processed_blocks(rc, node);
		}
		goto out;
	}

	if (root) {
		if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
			/*
			 * This block was the root block of a root, and this is
			 * the first time we're processing the block and thus it
			 * should not have had the ->new_bytenr modified and
			 * should have not been included on the changed list.
			 *
			 * However in the case of corruption we could have
			 * multiple refs pointing to the same block improperly,
			 * and thus we would trip over these checks.  ASSERT()
			 * for the developer case, because it could indicate a
			 * bug in the backref code, however error out for a
			 * normal user in the case of corruption.
			 */
			ASSERT(node->new_bytenr == 0);
			ASSERT(list_empty(&node->list));
			if (node->new_bytenr || !list_empty(&node->list)) {
				btrfs_err(root->fs_info,
				  "bytenr %llu has improper references to it",
					  node->bytenr);
				ret = -EUCLEAN;
				goto out;
			}
			ret = btrfs_record_root_in_trans(trans, root);
			if (ret)
				goto out;
			/*
			 * Another thread could have failed, need to check if we
			 * have reloc_root actually set.
			 */
			if (!root->reloc_root) {
				ret = -ENOENT;
				goto out;
			}
			root = root->reloc_root;
			node->new_bytenr = root->node->start;
			btrfs_put_root(node->root);
			node->root = btrfs_grab_root(root);
			ASSERT(node->root);
			list_add_tail(&node->list, &rc->backref_cache.changed);
		} else {
			path->lowest_level = node->level;
			if (root == root->fs_info->chunk_root)
				btrfs_reserve_chunk_metadata(trans, false);
			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
			btrfs_release_path(path);
			if (root == root->fs_info->chunk_root)
				btrfs_trans_release_chunk_metadata(trans);
			if (ret > 0)
				ret = 0;
		}
		if (!ret)
			update_processed_blocks(rc, node);
	} else {
		ret = do_relocation(trans, rc, node, key, path, 1);
	}
out:
	if (ret || node->level == 0 || node->cowonly)
		btrfs_backref_cleanup_node(&rc->backref_cache, node);
	return ret;
}

/*
 * relocate a list of blocks
 */
static noinline_for_stack
int relocate_tree_blocks(struct btrfs_trans_handle *trans,
			 struct reloc_control *rc, struct rb_root *blocks)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_backref_node *node;
	struct btrfs_path *path;
	struct tree_block *block;
	struct tree_block *next;
	int ret;
	int err = 0;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out_free_blocks;
	}

	/* Kick in readahead for tree blocks with missing keys */
	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
		if (!block->key_ready)
			btrfs_readahead_tree_block(fs_info, block->bytenr,
						   block->owner, 0,
						   block->level);
	}

	/* Get first keys */
	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
		if (!block->key_ready) {
			err = get_tree_block_key(fs_info, block);
			if (err)
				goto out_free_path;
		}
	}

	/* Do tree relocation */
	rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
		node = build_backref_tree(rc, &block->key,
					  block->level, block->bytenr);
		if (IS_ERR(node)) {
			err = PTR_ERR(node);
			goto out;
		}

		ret = relocate_tree_block(trans, rc, node, &block->key,
					  path);
		if (ret < 0) {
			err = ret;
			break;
		}
	}
out:
	err = finish_pending_nodes(trans, rc, path, err);

out_free_path:
	btrfs_free_path(path);
out_free_blocks:
	free_block_list(blocks);
	return err;
}

static noinline_for_stack int prealloc_file_extent_cluster(
				struct btrfs_inode *inode,
				struct file_extent_cluster *cluster)
{
	u64 alloc_hint = 0;
	u64 start;
	u64 end;
	u64 offset = inode->index_cnt;
	u64 num_bytes;
	int nr;
	int ret = 0;
	u64 i_size = i_size_read(&inode->vfs_inode);
	u64 prealloc_start = cluster->start - offset;
	u64 prealloc_end = cluster->end - offset;
	u64 cur_offset = prealloc_start;

	/*
	 * For subpage case, previous i_size may not be aligned to PAGE_SIZE.
	 * This means the range [i_size, PAGE_END + 1) is filled with zeros by
	 * btrfs_do_readpage() call of previously relocated file cluster.
	 *
	 * If the current cluster starts in the above range, btrfs_do_readpage()
	 * will skip the read, and relocate_one_page() will later writeback
	 * the padding zeros as new data, causing data corruption.
	 *
	 * Here we have to manually invalidate the range (i_size, PAGE_END + 1).
	 */
	if (!IS_ALIGNED(i_size, PAGE_SIZE)) {
		struct address_space *mapping = inode->vfs_inode.i_mapping;
		struct btrfs_fs_info *fs_info = inode->root->fs_info;
		const u32 sectorsize = fs_info->sectorsize;
		struct page *page;

		ASSERT(sectorsize < PAGE_SIZE);
		ASSERT(IS_ALIGNED(i_size, sectorsize));

		/*
		 * Subpage can't handle page with DIRTY but without UPTODATE
		 * bit as it can lead to the following deadlock:
		 *
		 * btrfs_read_folio()
		 * | Page already *locked*
		 * |- btrfs_lock_and_flush_ordered_range()
		 *    |- btrfs_start_ordered_extent()
		 *       |- extent_write_cache_pages()
		 *          |- lock_page()
		 *             We try to lock the page we already hold.
		 *
		 * Here we just writeback the whole data reloc inode, so that
		 * we will be ensured to have no dirty range in the page, and
		 * are safe to clear the uptodate bits.
		 *
		 * This shouldn't cause too much overhead, as we need to write
		 * the data back anyway.
		 */
		ret = filemap_write_and_wait(mapping);
		if (ret < 0)
			return ret;

		clear_extent_bits(&inode->io_tree, i_size,
				  round_up(i_size, PAGE_SIZE) - 1,
				  EXTENT_UPTODATE);
		page = find_lock_page(mapping, i_size >> PAGE_SHIFT);
		/*
		 * If page is freed we don't need to do anything then, as we
		 * will re-read the whole page anyway.
		 */
		if (page) {
			btrfs_subpage_clear_uptodate(fs_info, page, i_size,
					round_up(i_size, PAGE_SIZE) - i_size);
			unlock_page(page);
			put_page(page);
		}
	}

	BUG_ON(cluster->start != cluster->boundary[0]);
	ret = btrfs_alloc_data_chunk_ondemand(inode,
					      prealloc_end + 1 - prealloc_start);
	if (ret)
		return ret;

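	/*
	 * Preallocate one file extent per cluster boundary so the relocated
	 * data keeps the same extent boundaries as the source.
	 */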
	btrfs_inode_lock(&inode->vfs_inode, 0);
	for (nr = 0; nr < cluster->nr; nr++) {
		struct extent_state *cached_state = NULL;

		start = cluster->boundary[nr] - offset;
		if (nr + 1 < cluster->nr)
			end = cluster->boundary[nr + 1] - 1 - offset;
		else
			end = cluster->end - offset;

		lock_extent(&inode->io_tree, start, end, &cached_state);
		num_bytes = end + 1 - start;
		ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
						num_bytes, num_bytes,
						end + 1, &alloc_hint);
		cur_offset = end + 1;
		unlock_extent(&inode->io_tree, start, end, &cached_state);
		if (ret)
			break;
	}
	btrfs_inode_unlock(&inode->vfs_inode, 0);

	if (cur_offset < prealloc_end)
		btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
					       prealloc_end + 1 - cur_offset);
	return ret;
}

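/*
 * Pin a single extent map covering the whole cluster, mapping the file range
 * to the old data extent, so reads of the data reloc inode fetch the source
 * data while it is being copied.
 */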
static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
				u64 start, u64 end, u64 block_start)
{
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	int ret = 0;

	em = alloc_extent_map();
	if (!em)
		return -ENOMEM;

	em->start = start;
	em->len = end + 1 - start;
	em->block_len = em->len;
	em->block_start = block_start;
	set_bit(EXTENT_FLAG_PINNED, &em->flags);

	lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
	ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
	unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
	free_extent_map(em);

	return ret;
}

/*
 * Allow error injection to test balance/relocation cancellation
 */
noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info)
{
	return atomic_read(&fs_info->balance_cancel_req) ||
		atomic_read(&fs_info->reloc_cancel_req) ||
		fatal_signal_pending(current);
}
ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);

static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster,
				    int cluster_nr)
{
	/* Last extent, use cluster end directly */
	if (cluster_nr >= cluster->nr - 1)
		return cluster->end;

	/* Use next boundary start */
	return cluster->boundary[cluster_nr + 1] - 1;
}

static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
			     struct file_extent_cluster *cluster,
			     int *cluster_nr, unsigned long page_index)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 offset = BTRFS_I(inode)->index_cnt;
	const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
	struct page *page;
	u64 page_start;
	u64 page_end;
	u64 cur;
	int ret;

	ASSERT(page_index <= last_index);
	page = find_lock_page(inode->i_mapping, page_index);
	if (!page) {
		page_cache_sync_readahead(inode->i_mapping, ra, NULL,
				page_index, last_index + 1 - page_index);
		page = find_or_create_page(inode->i_mapping, page_index, mask);
		if (!page)
			return -ENOMEM;
	}
	ret = set_page_extent_mapped(page);
	if (ret < 0)
		goto release_page;

	if (PageReadahead(page))
		page_cache_async_readahead(inode->i_mapping, ra, NULL,
				page_folio(page), page_index,
				last_index + 1 - page_index);

	if (!PageUptodate(page)) {
		btrfs_read_folio(NULL, page_folio(page));
		lock_page(page);
		if (!PageUptodate(page)) {
			ret = -EIO;
			goto release_page;
		}
	}

	page_start = page_offset(page);
	page_end = page_start + PAGE_SIZE - 1;

	/*
	 * Start from the cluster, as for subpage case, the cluster can start
	 * inside the page.
	 */
	cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
	while (cur <= page_end) {
		struct extent_state *cached_state = NULL;
		u64 extent_start = cluster->boundary[*cluster_nr] - offset;
		u64 extent_end = get_cluster_boundary_end(cluster,
						*cluster_nr) - offset;
		u64 clamped_start = max(page_start, extent_start);
		u64 clamped_end = min(page_end, extent_end);
		u32 clamped_len = clamped_end + 1 - clamped_start;

		/* Reserve metadata for this range */
		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
						      clamped_len, clamped_len,
						      false);
		if (ret)
			goto release_page;

		/* Mark the range delalloc and dirty for later writeback */
		lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
			    &cached_state);
		ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
						clamped_end, 0, &cached_state);
		if (ret) {
			clear_extent_bit(&BTRFS_I(inode)->io_tree,
					 clamped_start, clamped_end,
					 EXTENT_LOCKED | EXTENT_BOUNDARY,
					 &cached_state);
			btrfs_delalloc_release_metadata(BTRFS_I(inode),
							clamped_len, true);
			btrfs_delalloc_release_extents(BTRFS_I(inode),
						       clamped_len);
			goto release_page;
		}
		btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len);

		/*
		 * Set the boundary if it's inside the page.
		 * Data relocation requires the destination extents to have the
		 * same size as the source.
		 * EXTENT_BOUNDARY bit prevents current extent from being merged
		 * with previous extent.
		 */
		if (in_range(cluster->boundary[*cluster_nr] - offset,
			     page_start, PAGE_SIZE)) {
			u64 boundary_start = cluster->boundary[*cluster_nr] -
						offset;
			u64 boundary_end = boundary_start +
					   fs_info->sectorsize - 1;

			set_extent_bits(&BTRFS_I(inode)->io_tree,
					boundary_start, boundary_end,
					EXTENT_BOUNDARY);
		}
		unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
			      &cached_state);
		btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
		cur += clamped_len;

		/* Crossed extent end, go to next extent */
		if (cur >= extent_end) {
			(*cluster_nr)++;
			/* Just finished the last extent of the cluster, exit. */
			if (*cluster_nr >= cluster->nr)
				break;
		}
	}
	unlock_page(page);
	put_page(page);

	balance_dirty_pages_ratelimited(inode->i_mapping);
	btrfs_throttle(fs_info);
	if (btrfs_should_cancel_balance(fs_info))
		ret = -ECANCELED;
	return ret;

release_page:
	unlock_page(page);
	put_page(page);
	return ret;
}

static int relocate_file_extent_cluster(struct inode *inode,
					struct file_extent_cluster *cluster)
{
	u64 offset = BTRFS_I(inode)->index_cnt;
	unsigned long index;
	unsigned long last_index;
	struct file_ra_state *ra;
	int cluster_nr = 0;
	int ret = 0;

	if (!cluster->nr)
		return 0;

	ra = kzalloc(sizeof(*ra), GFP_NOFS);
	if (!ra)
		return -ENOMEM;

	ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
	if (ret)
		goto out;

	file_ra_state_init(ra, inode->i_mapping);

	ret = setup_relocation_extent_mapping(inode, cluster->start - offset,
				   cluster->end - offset, cluster->start);
	if (ret)
		goto out;

	last_index = (cluster->end - offset) >> PAGE_SHIFT;
	for (index = (cluster->start - offset) >> PAGE_SHIFT;
	     index <= last_index && !ret; index++)
		ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
	if (ret == 0)
		WARN_ON(cluster_nr != cluster->nr);
out:
	kfree(ra);
	return ret;
}

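/*
 * Add one data extent to the current cluster.  A gap between extents flushes
 * the cluster first, and a full cluster (MAX_EXTENTS boundaries) is relocated
 * immediately, so only contiguous extents are relocated together.
 */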
static noinline_for_stack
int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key,
			 struct file_extent_cluster *cluster)
{
	int ret;

	if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
		ret = relocate_file_extent_cluster(inode, cluster);
		if (ret)
			return ret;
		cluster->nr = 0;
	}

	if (!cluster->nr)
		cluster->start = extent_key->objectid;
	else
		BUG_ON(cluster->nr >= MAX_EXTENTS);
	cluster->end = extent_key->objectid + extent_key->offset - 1;
	cluster->boundary[cluster->nr] = extent_key->objectid;
	cluster->nr++;

	if (cluster->nr >= MAX_EXTENTS) {
		ret = relocate_file_extent_cluster(inode, cluster);
		if (ret)
			return ret;
		cluster->nr = 0;
	}
	return 0;
}

/*
 * helper to add a tree block to the list.
 * the major work is getting the generation and level of the block
 */
static int add_tree_block(struct reloc_control *rc,
			  struct btrfs_key *extent_key,
			  struct btrfs_path *path,
			  struct rb_root *blocks)
{
	struct extent_buffer *eb;
	struct btrfs_extent_item *ei;
	struct btrfs_tree_block_info *bi;
	struct tree_block *block;
	struct rb_node *rb_node;
	u32 item_size;
	int level = -1;
	u64 generation;
	u64 owner = 0;

	eb = path->nodes[0];
	item_size = btrfs_item_size(eb, path->slots[0]);

	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
	    item_size >= sizeof(*ei) + sizeof(*bi)) {
		unsigned long ptr = 0, end;

		ei = btrfs_item_ptr(eb, path->slots[0],
				struct btrfs_extent_item);
		end = (unsigned long)ei + item_size;
		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
			bi = (struct btrfs_tree_block_info *)(ei + 1);
			level = btrfs_tree_block_level(eb, bi);
			ptr = (unsigned long)(bi + 1);
		} else {
			level = (int)extent_key->offset;
			ptr = (unsigned long)(ei + 1);
		}
		generation = btrfs_extent_generation(eb, ei);

		/*
		 * We're reading random blocks without knowing their owner ahead
		 * of time.  This is ok most of the time, as all reloc roots and
		 * fs roots have the same lock type.  However normal trees do
		 * not, and the only way to know ahead of time is to read the
		 * inline ref offset.  We know it's an fs root if
		 *
		 * 1. There's more than one ref.
		 * 2. There's a SHARED_DATA_REF_KEY set.
		 * 3. FULL_BACKREF is set on the flags.
		 *
		 * Otherwise it's safe to assume that the ref offset == the
		 * owner of this block, so we can use that when calling
		 * read_tree_block.
		 */
		if (btrfs_extent_refs(eb, ei) == 1 &&
		    !(btrfs_extent_flags(eb, ei) &
		      BTRFS_BLOCK_FLAG_FULL_BACKREF) &&
		    ptr < end) {
			struct btrfs_extent_inline_ref *iref;
			int type;

			iref = (struct btrfs_extent_inline_ref *)ptr;
			type = btrfs_get_extent_inline_ref_type(eb, iref,
							BTRFS_REF_TYPE_BLOCK);
			if (type == BTRFS_REF_TYPE_INVALID)
				return -EINVAL;
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				owner = btrfs_extent_inline_ref_offset(eb, iref);
		}
	} else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
		btrfs_print_v0_err(eb->fs_info);
		btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL);
		return -EINVAL;
	} else {
		BUG();
	}

	btrfs_release_path(path);

	BUG_ON(level == -1);

	block = kmalloc(sizeof(*block), GFP_NOFS);
	if (!block)
		return -ENOMEM;

	block->bytenr = extent_key->objectid;
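	/*
	 * block->key temporarily holds the block size and generation; the
	 * real first key is filled in later by get_tree_block_key().
	 */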
	block->key.objectid = rc->extent_root->fs_info->nodesize;
	block->key.offset = generation;
	block->level = level;
	block->key_ready = 0;
	block->owner = owner;

	rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
	if (rb_node)
		btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
				    -EEXIST);

	return 0;
}

/*
 * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
 */
static int __add_tree_block(struct reloc_control *rc,
			    u64 bytenr, u32 blocksize,
			    struct rb_root *blocks)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;
	bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

	if (tree_block_processed(bytenr, rc))
		return 0;

	if (rb_simple_search(blocks, bytenr))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
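	/*
	 * On SKINNY_METADATA filesystems try the METADATA_ITEM key first; if
	 * it is not found we retry below with the full EXTENT_ITEM key.
	 */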
again:
	key.objectid = bytenr;
	if (skinny) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = (u64)-1;
	} else {
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = blocksize;
	}

	path->search_commit_root = 1;
	path->skip_locking = 1;
	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (ret > 0 && skinny) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    (key.type == BTRFS_METADATA_ITEM_KEY ||
			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
			      key.offset == blocksize)))
				ret = 0;
		}

		if (ret) {
			skinny = false;
			btrfs_release_path(path);
			goto again;
		}
	}
	if (ret) {
		ASSERT(ret == 1);
		btrfs_print_leaf(path->nodes[0]);
		btrfs_err(fs_info,
	     "tree block extent item (%llu) is not found in extent tree",
		     bytenr);
		WARN_ON(1);
		ret = -EINVAL;
		goto out;
	}

	ret = add_tree_block(rc, &key, path, blocks);
out:
	btrfs_free_path(path);
	return ret;
}

static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
				    struct btrfs_block_group *block_group,
				    struct inode *inode,
				    u64 ino)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;
	int ret = 0;

	if (inode)
		goto truncate;

	inode = btrfs_iget(fs_info->sb, ino, root);
	if (IS_ERR(inode))
		return -ENOENT;

truncate:
	ret = btrfs_check_trunc_cache_free_space(fs_info,
						 &fs_info->global_block_rsv);
	if (ret)
		goto out;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

	ret = btrfs_truncate_free_space_cache(trans, block_group, inode);

	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
out:
	iput(inode);
	return ret;
}

/*
 * Locate the free space cache EXTENT_DATA in root tree leaf and delete the
 * cache inode, to avoid free space cache data extent blocking data relocation.
 */
static int delete_v1_space_cache(struct extent_buffer *leaf,
				 struct btrfs_block_group *block_group,
				 u64 data_bytenr)
{
	u64 space_cache_ino;
	struct btrfs_file_extent_item *ei;
	struct btrfs_key key;
	bool found = false;
	int i;
	int ret;

	if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
		return 0;

	for (i = 0; i < btrfs_header_nritems(leaf); i++) {
		u8 type;

		btrfs_item_key_to_cpu(leaf, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(leaf, ei);

		if ((type == BTRFS_FILE_EXTENT_REG ||
		     type == BTRFS_FILE_EXTENT_PREALLOC) &&
		    btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
			found = true;
			space_cache_ino = key.objectid;
			break;
		}
	}
	if (!found)
		return -ENOENT;
	ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
					space_cache_ino);
	return ret;
}

/*
 * helper to find all tree blocks that reference a given data extent
 */
static noinline_for_stack
int add_data_references(struct reloc_control *rc,
			struct btrfs_key *extent_key,
			struct btrfs_path *path,
			struct rb_root *blocks)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct ulist *leaves = NULL;
	struct ulist_iterator leaf_uiter;
	struct ulist_node *ref_node = NULL;
	const u32 blocksize = fs_info->nodesize;
	int ret = 0;

	btrfs_release_path(path);
	ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid,
				   0, &leaves, BTRFS_IGNORE_EXTENT_OFFSET);
	if (ret < 0)
		return ret;

	ULIST_ITER_INIT(&leaf_uiter);
	while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
		struct extent_buffer *eb;

		eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL);
		if (IS_ERR(eb)) {
			ret = PTR_ERR(eb);
			break;
		}
		ret = delete_v1_space_cache(eb, rc->block_group,
					    extent_key->objectid);
		free_extent_buffer(eb);
		if (ret < 0)
			break;
		ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
		if (ret < 0)
			break;
	}
	if (ret < 0)
		free_block_list(blocks);
	ulist_free(leaves);
	return ret;
}

/*
 * helper to find next unprocessed extent
 */
static noinline_for_stack
int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
		     struct btrfs_key *extent_key)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u64 start, end, last;
	int ret;

	last = rc->block_group->start + rc->block_group->length;
	while (1) {
		cond_resched();
		if (rc->search_start >= last) {
			ret = 1;
			break;
		}

		key.objectid = rc->search_start;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = 0;

		path->search_commit_root = 1;
		path->skip_locking = 1;
		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
					0, 0);
		if (ret < 0)
			break;
next:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(rc->extent_root, path);
			if (ret != 0)
				break;
			leaf = path->nodes[0];
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid >= last) {
			ret = 1;
			break;
		}

		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
		    key.type != BTRFS_METADATA_ITEM_KEY) {
			path->slots[0]++;
			goto next;
		}

		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
		    key.objectid + key.offset <= rc->search_start) {
			path->slots[0]++;
			goto next;
		}

		if (key.type == BTRFS_METADATA_ITEM_KEY &&
		    key.objectid + fs_info->nodesize <=
		    rc->search_start) {
			path->slots[0]++;
			goto next;
		}

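		/* Skip extents already marked processed in this pass. */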
		ret = find_first_extent_bit(&rc->processed_blocks,
					    key.objectid, &start, &end,
					    EXTENT_DIRTY, NULL);

		if (ret == 0 && start <= key.objectid) {
			btrfs_release_path(path);
			rc->search_start = end + 1;
		} else {
			if (key.type == BTRFS_EXTENT_ITEM_KEY)
				rc->search_start = key.objectid + key.offset;
			else
				rc->search_start = key.objectid +
					fs_info->nodesize;
			memcpy(extent_key, &key, sizeof(key));
			return 0;
		}
	}
	btrfs_release_path(path);
	return ret;
}

static void set_reloc_control(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;

	mutex_lock(&fs_info->reloc_mutex);
	fs_info->reloc_ctl = rc;
	mutex_unlock(&fs_info->reloc_mutex);
}

static void unset_reloc_control(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;

	mutex_lock(&fs_info->reloc_mutex);
	fs_info->reloc_ctl = NULL;
	mutex_unlock(&fs_info->reloc_mutex);
}

static noinline_for_stack
int prepare_to_relocate(struct reloc_control *rc)
{
	struct btrfs_trans_handle *trans;
	int ret;

	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
					      BTRFS_BLOCK_RSV_TEMP);
	if (!rc->block_rsv)
		return -ENOMEM;

	memset(&rc->cluster, 0, sizeof(rc->cluster));
	rc->search_start = rc->block_group->start;
	rc->extents_found = 0;
	rc->nodes_relocated = 0;
	rc->merging_rsv_size = 0;
	rc->reserved_bytes = 0;
	rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
			      RELOCATION_RESERVED_NODES;
	ret = btrfs_block_rsv_refill(rc->extent_root->fs_info,
				     rc->block_rsv, rc->block_rsv->size,
				     BTRFS_RESERVE_FLUSH_ALL);
	if (ret)
		return ret;

	rc->create_reloc_tree = 1;
	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		unset_reloc_control(rc);
		/*
		 * extent tree is not a ref_cow tree and has no reloc_root to
		 * cleanup.  And callers are responsible to free the above
		 * block rsv.
		 */
		return PTR_ERR(trans);
	}

	ret = btrfs_commit_transaction(trans);
	if (ret)
		unset_reloc_control(rc);

	return ret;
}

static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
{
	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
	struct rb_root blocks = RB_ROOT;
	struct btrfs_key key;
	struct btrfs_trans_handle *trans = NULL;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	u64 flags;
	int ret;
	int err = 0;
	int progress = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD;

	ret = prepare_to_relocate(rc);
	if (ret) {
		err = ret;
		goto out_free;
	}

	while (1) {
		rc->reserved_bytes = 0;
		ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
					     rc->block_rsv->size,
					     BTRFS_RESERVE_FLUSH_ALL);
		if (ret) {
			err = ret;
			break;
		}
		progress++;
		trans = btrfs_start_transaction(rc->extent_root, 0);
		if (IS_ERR(trans)) {
			err = PTR_ERR(trans);
			trans = NULL;
			break;
		}
restart:
		if (update_backref_cache(trans, &rc->backref_cache)) {
			btrfs_end_transaction(trans);
			trans = NULL;
			continue;
		}

		ret = find_next_extent(rc, path, &key);
		if (ret < 0)
			err = ret;
		if (ret != 0)
			break;

		rc->extents_found++;

		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_extent_item);
		flags = btrfs_extent_flags(path->nodes[0], ei);

		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
			ret = add_tree_block(rc, &key, path, &blocks);
		} else if (rc->stage == UPDATE_DATA_PTRS &&
			   (flags & BTRFS_EXTENT_FLAG_DATA)) {
			ret = add_data_references(rc, &key, path, &blocks);
		} else {
			btrfs_release_path(path);
			ret = 0;
		}
		if (ret < 0) {
			err = ret;
			break;
		}

		if (!RB_EMPTY_ROOT(&blocks)) {
			ret = relocate_tree_blocks(trans, rc, &blocks);
			if (ret < 0) {
				if (ret != -EAGAIN) {
					err = ret;
					break;
				}
				rc->extents_found--;
				rc->search_start = key.objectid;
			}
		}

		btrfs_end_transaction_throttle(trans);
		btrfs_btree_balance_dirty(fs_info);
		trans = NULL;

		if (rc->stage == MOVE_DATA_EXTENTS &&
		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
			rc->found_file_extent = 1;
			ret = relocate_data_extent(rc->data_inode,
						   &key, &rc->cluster);
			if (ret < 0) {
				err = ret;
				break;
			}
		}
		if (btrfs_should_cancel_balance(fs_info)) {
			err = -ECANCELED;
			break;
		}
	}
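	/*
	 * On ENOSPC, try to force-allocate a fresh chunk of the block group's
	 * type and restart the pass rather than failing the relocation.
	 */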
	if (trans && progress && err == -ENOSPC) {
		ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
		if (ret == 1) {
			err = 0;
			progress = 0;
			goto restart;
		}
	}

	btrfs_release_path(path);
	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);

	if (trans) {
		btrfs_end_transaction_throttle(trans);
		btrfs_btree_balance_dirty(fs_info);
	}

	if (!err) {
		ret = relocate_file_extent_cluster(rc->data_inode,
						   &rc->cluster);
		if (ret < 0)
			err = ret;
	}

	rc->create_reloc_tree = 0;
	set_reloc_control(rc);

	btrfs_backref_release_cache(&rc->backref_cache);
	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);

	/*
	 * Even in the case when the relocation is cancelled, we should all go
	 * through prepare_to_merge() and merge_reloc_roots().
	 *
	 * For error (including cancelled balance), prepare_to_merge() will
	 * mark all reloc trees orphan, then queue them for cleanup in
	 * merge_reloc_roots()
	 */
	err = prepare_to_merge(rc, err);

	merge_reloc_roots(rc);

	rc->merge_reloc_tree = 0;
	unset_reloc_control(rc);
	btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);

	/* get rid of pinned extents */
	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}
	ret = btrfs_commit_transaction(trans);
	if (ret && !err)
		err = ret;
out_free:
	ret = clean_dirty_subvols(rc);
	if (ret < 0 && !err)
		err = ret;
	btrfs_free_block_rsv(fs_info, rc->block_rsv);
	btrfs_free_path(path);
	return err;
}

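/*
 * Insert a bare inode item for the data relocation inode.  The item is
 * zeroed, so its link count is 0, and no INODE_REF is added: the inode is
 * an orphan from the moment it is created, and create_reloc_inode() puts
 * it on the orphan list right after.
 */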
static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_inode_item *item;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
	btrfs_set_inode_generation(leaf, item, 1);
	btrfs_set_inode_size(leaf, item, 0);
	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
					  BTRFS_INODE_PREALLOC);
	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return ret;
}

static void delete_orphan_inode(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto out;
	}
	ret = btrfs_del_item(trans, root, path);
out:
	if (ret)
		btrfs_abort_transaction(trans, ret);
	btrfs_free_path(path);
}
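
/*
 * delete_orphan_inode() above is only used on the failure path of
 * create_reloc_inode() below, to undo __insert_orphan_inode() when the
 * subsequent btrfs_iget() fails.  Any error while undoing aborts the
 * transaction, presumably because there is no reasonable way to recover
 * from a half-created relocation inode.
 */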

/*
 * Helper to create an inode for data relocation.
 *
 * The inode lives in the data relocation tree and its link count is 0.
 */
static noinline_for_stack
struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
				 struct btrfs_block_group *group)
{
	struct inode *inode = NULL;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *root;
	u64 objectid;
	int err = 0;

	root = btrfs_grab_root(fs_info->data_reloc_root);
	trans = btrfs_start_transaction(root, 6);
	if (IS_ERR(trans)) {
		btrfs_put_root(root);
		return ERR_CAST(trans);
	}

	err = btrfs_get_free_objectid(root, &objectid);
	if (err)
		goto out;

	err = __insert_orphan_inode(trans, root, objectid);
	if (err)
		goto out;

	inode = btrfs_iget(fs_info->sb, objectid, root);
	if (IS_ERR(inode)) {
		delete_orphan_inode(trans, root, objectid);
		err = PTR_ERR(inode);
		inode = NULL;
		goto out;
	}
	BTRFS_I(inode)->index_cnt = group->start;

	err = btrfs_orphan_add(trans, BTRFS_I(inode));
out:
	btrfs_put_root(root);
	btrfs_end_transaction(trans);
	btrfs_btree_balance_dirty(fs_info);
	if (err) {
		iput(inode);
		inode = ERR_PTR(err);
	}
	return inode;
}
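
/*
 * Note that the data reloc inode reuses ->index_cnt to remember the start
 * of the block group being relocated: file offset X in this inode maps to
 * disk bytenr group->start + X.  btrfs_reloc_clone_csums() relies on this
 * when it computes disk_bytenr = file_pos + inode->index_cnt.
 */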

/*
 * Mark start of chunk relocation that is cancellable. Check if the cancellation
 * has been requested meanwhile and don't start in that case.
 *
 * Return:
 *   0             success
 *   -EINPROGRESS  operation is already in progress, that's probably a bug
 *   -ECANCELED    cancellation request was set before the operation started
 */
static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
{
	if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
		/* This should not happen */
		btrfs_err(fs_info, "reloc already running, cannot start");
		return -EINPROGRESS;
	}

	if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
		btrfs_info(fs_info, "chunk relocation canceled on start");
		/*
		 * On cancel, clear all requests but let the caller mark
		 * the end after cleanup operations.
		 */
		atomic_set(&fs_info->reloc_cancel_req, 0);
		return -ECANCELED;
	}
	return 0;
}

/*
 * Mark end of chunk relocation that is cancellable and wake any waiters.
 */
static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
{
	/* Requested after start, clear bit first so any waiters can continue */
	if (atomic_read(&fs_info->reloc_cancel_req) > 0)
		btrfs_info(fs_info, "chunk relocation canceled during operation");
	clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
	atomic_set(&fs_info->reloc_cancel_req, 0);
}
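
/*
 * The two helpers above bracket every cancellable relocation operation.
 * The pairing used by the callers in this file looks like this (a sketch,
 * not a copy of any one caller):
 *
 *	ret = reloc_chunk_start(fs_info);
 *	if (ret < 0)
 *		goto out;
 *	... relocate or recover ...
 *	reloc_chunk_end(fs_info);
 *
 * i.e. once reloc_chunk_start() has succeeded, reloc_chunk_end() must run
 * on every exit path, including errors.
 */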

static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
{
	struct reloc_control *rc;

	rc = kzalloc(sizeof(*rc), GFP_NOFS);
	if (!rc)
		return NULL;

	INIT_LIST_HEAD(&rc->reloc_roots);
	INIT_LIST_HEAD(&rc->dirty_subvol_roots);
	btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
	mapping_tree_init(&rc->reloc_root_tree);
	extent_io_tree_init(fs_info, &rc->processed_blocks,
			    IO_TREE_RELOC_BLOCKS, NULL);
	return rc;
}

static void free_reloc_control(struct reloc_control *rc)
{
	struct mapping_node *node, *tmp;

	free_reloc_roots(&rc->reloc_roots);
	rbtree_postorder_for_each_entry_safe(node, tmp,
			&rc->reloc_root_tree.rb_root, rb_node)
		kfree(node);

	kfree(rc);
}

/*
 * Print the block group being relocated
 */
static void describe_relocation(struct btrfs_fs_info *fs_info,
				struct btrfs_block_group *block_group)
{
	char buf[128] = {'\0'};

	btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));

	btrfs_info(fs_info,
		   "relocating block group %llu flags %s",
		   block_group->start, buf);
}

static const char *stage_to_string(int stage)
{
	if (stage == MOVE_DATA_EXTENTS)
		return "move data extents";
	if (stage == UPDATE_DATA_PTRS)
		return "update data pointers";
	return "unknown";
}

/*
 * Relocate all extents of a block group.
 */
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
{
	struct btrfs_block_group *bg;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
	struct reloc_control *rc;
	struct inode *inode;
	struct btrfs_path *path;
	int ret;
	int rw = 0;
	int err = 0;

	/*
	 * This only gets set if we had a half-deleted snapshot on mount.  We
	 * cannot allow relocation to start while we're still trying to clean up
	 * these pending deletions.
	 */
	ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
	if (ret)
		return ret;

	/* We may have been woken up by close_ctree, so bail if we're closing. */
	if (btrfs_fs_closing(fs_info))
		return -EINTR;

	bg = btrfs_lookup_block_group(fs_info, group_start);
	if (!bg)
		return -ENOENT;

	/*
	 * Relocation of a data block group creates ordered extents.  Without
	 * sb_start_write(), we can freeze the filesystem while unfinished
	 * ordered extents are left. Such ordered extents can cause a deadlock
	 * e.g. when syncfs() is waiting for their completion but they can't
	 * finish because they block when joining a transaction, due to the
	 * fact that the freeze locks are being held in write mode.
	 */
	if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
		ASSERT(sb_write_started(fs_info->sb));

	if (btrfs_pinned_by_swapfile(fs_info, bg)) {
		btrfs_put_block_group(bg);
		return -ETXTBSY;
	}

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		btrfs_put_block_group(bg);
		return -ENOMEM;
	}

	ret = reloc_chunk_start(fs_info);
	if (ret < 0) {
		err = ret;
		goto out_put_bg;
	}

	rc->extent_root = extent_root;
	rc->block_group = bg;

	ret = btrfs_inc_block_group_ro(rc->block_group, true);
	if (ret) {
		err = ret;
		goto out;
	}
	rw = 1;

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	inode = lookup_free_space_inode(rc->block_group, path);
	btrfs_free_path(path);

	if (!IS_ERR(inode))
		ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
	else
		ret = PTR_ERR(inode);

	if (ret && ret != -ENOENT) {
		err = ret;
		goto out;
	}

	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
	if (IS_ERR(rc->data_inode)) {
		err = PTR_ERR(rc->data_inode);
		rc->data_inode = NULL;
		goto out;
	}

	describe_relocation(fs_info, rc->block_group);

	btrfs_wait_block_group_reservations(rc->block_group);
	btrfs_wait_nocow_writers(rc->block_group);
	btrfs_wait_ordered_roots(fs_info, U64_MAX,
				 rc->block_group->start,
				 rc->block_group->length);

	ret = btrfs_zone_finish(rc->block_group);
	WARN_ON(ret && ret != -EAGAIN);

	while (1) {
		int finishes_stage;

		mutex_lock(&fs_info->cleaner_mutex);
		ret = relocate_block_group(rc);
		mutex_unlock(&fs_info->cleaner_mutex);
		if (ret < 0)
			err = ret;

		finishes_stage = rc->stage;
		/*
		 * We may have gotten ENOSPC after we already dirtied some
		 * extents.  If writeout happens while we're relocating a
		 * different block group we could end up hitting the
		 * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
		 * btrfs_reloc_cow_block.  Make sure we write everything out
		 * properly so we don't trip over this problem, and then break
		 * out of the loop if we hit an error.
		 */
		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
			ret = btrfs_wait_ordered_range(rc->data_inode, 0,
						       (u64)-1);
			if (ret)
				err = ret;
			invalidate_mapping_pages(rc->data_inode->i_mapping,
						 0, -1);
			rc->stage = UPDATE_DATA_PTRS;
		}

		if (err < 0)
			goto out;

		if (rc->extents_found == 0)
			break;

		btrfs_info(fs_info, "found %llu extents, stage: %s",
			   rc->extents_found, stage_to_string(finishes_stage));
	}

	WARN_ON(rc->block_group->pinned > 0);
	WARN_ON(rc->block_group->reserved > 0);
	WARN_ON(rc->block_group->used > 0);
out:
	if (err && rw)
		btrfs_dec_block_group_ro(rc->block_group);
	iput(rc->data_inode);
out_put_bg:
	btrfs_put_block_group(bg);
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
	return err;
}
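
/*
 * A hedged sketch of the expected calling convention, derived from the
 * checks above (the actual callers live outside this file):
 *
 *	sb_start_write(sb);	mandatory for data block groups
 *	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
 *	sb_end_write(sb);
 *
 * -ETXTBSY means the block group is pinned by an active swapfile and can
 * only be relocated after the swapfile is deactivated.
 */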

static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	int ret, err;

	trans = btrfs_start_transaction(fs_info->tree_root, 0);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	memset(&root->root_item.drop_progress, 0,
		sizeof(root->root_item.drop_progress));
	btrfs_set_root_drop_level(&root->root_item, 0);
	btrfs_set_root_refs(&root->root_item, 0);
	ret = btrfs_update_root(trans, fs_info->tree_root,
				&root->root_key, &root->root_item);

	err = btrfs_end_transaction(trans);
	if (err)
		return err;
	return ret;
}
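
/*
 * Clearing drop_progress/drop_level and setting root_refs to 0 marks the
 * reloc root as garbage: btrfs_recover_relocation() then treats it like
 * the other refs == 0 reloc roots and queues it for cleanup in
 * merge_reloc_roots() instead of merging it.
 */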

/*
 * Recover relocation interrupted by a system crash.
 *
 * This function resumes merging reloc trees with the corresponding fs trees,
 * which is important for keeping the sharing of tree blocks.
 */
int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
{
	LIST_HEAD(reloc_roots);
	struct btrfs_key key;
	struct btrfs_root *fs_root;
	struct btrfs_root *reloc_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct reloc_control *rc = NULL;
	struct btrfs_trans_handle *trans;
	int ret;
	int err = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	path->reada = READA_BACK;

	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	while (1) {
		ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
					path, 0, 0);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		if (ret > 0) {
			if (path->slots[0] == 0)
				break;
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		btrfs_release_path(path);

		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
		    key.type != BTRFS_ROOT_ITEM_KEY)
			break;

		reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
		if (IS_ERR(reloc_root)) {
			err = PTR_ERR(reloc_root);
			goto out;
		}

		set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
		list_add(&reloc_root->root_list, &reloc_roots);

		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
			fs_root = btrfs_get_fs_root(fs_info,
					reloc_root->root_key.offset, false);
			if (IS_ERR(fs_root)) {
				ret = PTR_ERR(fs_root);
				if (ret != -ENOENT) {
					err = ret;
					goto out;
				}
				ret = mark_garbage_root(reloc_root);
				if (ret < 0) {
					err = ret;
					goto out;
				}
			} else {
				btrfs_put_root(fs_root);
			}
		}

		if (key.offset == 0)
			break;

		key.offset--;
	}
	btrfs_release_path(path);

	if (list_empty(&reloc_roots))
		goto out;

	rc = alloc_reloc_control(fs_info);
	if (!rc) {
		err = -ENOMEM;
		goto out;
	}

	ret = reloc_chunk_start(fs_info);
	if (ret < 0) {
		err = ret;
		goto out_end;
	}

	rc->extent_root = btrfs_extent_root(fs_info, 0);

	set_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_unset;
	}

	rc->merge_reloc_tree = 1;

	while (!list_empty(&reloc_roots)) {
		reloc_root = list_entry(reloc_roots.next,
					struct btrfs_root, root_list);
		list_del(&reloc_root->root_list);

		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
			list_add_tail(&reloc_root->root_list,
				      &rc->reloc_roots);
			continue;
		}

		fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
					    false);
		if (IS_ERR(fs_root)) {
			err = PTR_ERR(fs_root);
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_end_transaction(trans);
			goto out_unset;
		}

		err = __add_reloc_root(reloc_root);
		ASSERT(err != -EEXIST);
		if (err) {
			list_add_tail(&reloc_root->root_list, &reloc_roots);
			btrfs_put_root(fs_root);
			btrfs_end_transaction(trans);
			goto out_unset;
		}
		fs_root->reloc_root = btrfs_grab_root(reloc_root);
		btrfs_put_root(fs_root);
	}

	err = btrfs_commit_transaction(trans);
	if (err)
		goto out_unset;

	merge_reloc_roots(rc);

	unset_reloc_control(rc);

	trans = btrfs_join_transaction(rc->extent_root);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_clean;
	}
	err = btrfs_commit_transaction(trans);
out_clean:
	ret = clean_dirty_subvols(rc);
	if (ret < 0 && !err)
		err = ret;
out_unset:
	unset_reloc_control(rc);
out_end:
	reloc_chunk_end(fs_info);
	free_reloc_control(rc);
out:
	free_reloc_roots(&reloc_roots);

	btrfs_free_path(path);

	if (err == 0) {
		/* cleanup orphan inode in data relocation tree */
		fs_root = btrfs_grab_root(fs_info->data_reloc_root);
		ASSERT(fs_root);
		err = btrfs_orphan_cleanup(fs_root);
		btrfs_put_root(fs_root);
	}
	return err;
}
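
/*
 * To summarize the recovery above (a reading of the logic, not a spec):
 * reloc roots whose fs root is gone are marked garbage and cleaned up,
 * while roots whose fs root still exists are re-attached through
 * fs_root->reloc_root and merged as if the interrupted balance had kept
 * running.
 */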

/*
 * Helper to add ordered checksums for data relocation.
 *
 * Cloning the existing checksums correctly handles nodatasum extents, and
 * also saves the CPU time of re-calculating the checksums.
 */
int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_root *csum_root;
	struct btrfs_ordered_sum *sums;
	struct btrfs_ordered_extent *ordered;
	int ret;
	u64 disk_bytenr;
	u64 new_bytenr;
	LIST_HEAD(list);

	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
	BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len);

	disk_bytenr = file_pos + inode->index_cnt;
	csum_root = btrfs_csum_root(fs_info, disk_bytenr);
	ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
				       disk_bytenr + len - 1, &list, 0, false);
	if (ret)
		goto out;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del_init(&sums->list);

		/*
		 * We need to offset the new_bytenr based on where the csum is.
		 * We need to do this because we will read in entire prealloc
		 * extents but we may have written to say the middle of the
		 * prealloc extent, so we need to make sure the csum goes with
		 * the right disk offset.
		 *
		 * We can do this because the data reloc inode refers strictly
		 * to the on disk bytes, so we don't have to worry about
		 * disk_len vs real len like with real inodes since it's all
		 * disk length.
		 */
		new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr;
		sums->bytenr = new_bytenr;

		btrfs_add_ordered_sum(ordered, sums);
	}
out:
	btrfs_put_ordered_extent(ordered);
	return ret;
}
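
/*
 * A worked example of the offset math above, with made-up numbers: if the
 * old extent started at disk bytenr 1000, the relocated copy starts at
 * ordered->disk_bytenr 5000, and a csum item covered old bytenr 1256, then
 * new_bytenr = 5000 + 1256 - 1000 = 5256, i.e. the csum keeps its 256 byte
 * offset into the extent at the new location.
 */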
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *buf,
			  struct extent_buffer *cow)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct reloc_control *rc;
	struct btrfs_backref_node *node;
	int first_cow = 0;
	int level;
	int ret = 0;

	rc = fs_info->reloc_ctl;
	if (!rc)
		return 0;

	BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));

	level = btrfs_header_level(buf);
	if (btrfs_header_generation(buf) <=
	    btrfs_root_last_snapshot(&root->root_item))
		first_cow = 1;

	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
	    rc->create_reloc_tree) {
		WARN_ON(!first_cow && level == 0);

		node = rc->backref_cache.path[level];
		BUG_ON(node->bytenr != buf->start &&
		       node->new_bytenr != buf->start);

		btrfs_backref_drop_node_buffer(node);
		atomic_inc(&cow->refs);
		node->eb = cow;
		node->new_bytenr = cow->start;

		if (!node->pending) {
			list_move_tail(&node->list,
				       &rc->backref_cache.pending[level]);
			node->pending = 1;
		}

		if (first_cow)
			mark_block_processed(rc, node);

		if (first_cow && level > 0)
			rc->nodes_relocated += buf->len;
	}

	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
		ret = replace_file_extents(trans, rc, root, cow);
	return ret;
}
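
/*
 * The first_cow test above in numbers (hypothetical generations): a buffer
 * with generation 5 in a root whose last_snapshot is 7 predates the last
 * snapshot, so this COW is the first one since that snapshot and first_cow
 * is set.
 */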

/*
 * Called before creating a snapshot.  It calculates the metadata
 * reservation required for relocating tree blocks in the snapshot.
 */
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
			      u64 *bytes_to_reserve)
{
	struct btrfs_root *root = pending->root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;

	if (!rc || !have_reloc_root(root))
		return;

	if (!rc->merge_reloc_tree)
		return;

	root = root->reloc_root;
	BUG_ON(btrfs_root_refs(&root->root_item) == 0);
	/*
	 * Relocation is in the stage of merging trees.  The space used by
	 * merging a reloc tree is twice the size of the relocated tree nodes
	 * in the worst case: half for COWing the reloc tree, half for COWing
	 * the fs tree.  The space used by COWing the reloc tree will be freed
	 * after the tree is dropped.  If we create a snapshot, COWing the fs
	 * tree may use more space than it frees, so we need to reserve extra
	 * space.
	 */
	*bytes_to_reserve += rc->nodes_relocated;
}
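
/*
 * The extra reservation made above is consumed by the post-snapshot hook
 * below: while a merge is in progress, btrfs_reloc_post_snapshot()
 * migrates rc->nodes_relocated bytes from pending->block_rsv into
 * rc->block_rsv before creating the reloc root for the new snapshot.
 */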

/*
 * Called after the snapshot is created.  It migrates the block reservation
 * and creates a reloc root for the newly created snapshot.
 *
 * This is similar to btrfs_init_reloc_root(); we come out of here with two
 * references held on the reloc_root, one for root->reloc_root and one for
 * rc->reloc_roots.
 */
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
			       struct btrfs_pending_snapshot *pending)
{
	struct btrfs_root *root = pending->root;
	struct btrfs_root *reloc_root;
	struct btrfs_root *new_root;
	struct reloc_control *rc = root->fs_info->reloc_ctl;
	int ret;

	if (!rc || !have_reloc_root(root))
		return 0;

	rc = root->fs_info->reloc_ctl;
	rc->merging_rsv_size += rc->nodes_relocated;

	if (rc->merge_reloc_tree) {
		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
					      rc->block_rsv,
					      rc->nodes_relocated, true);
		if (ret)
			return ret;
	}

	new_root = pending->snap;
	reloc_root = create_reloc_root(trans, root->reloc_root,
				       new_root->root_key.objectid);
	if (IS_ERR(reloc_root))
		return PTR_ERR(reloc_root);

	ret = __add_reloc_root(reloc_root);
	ASSERT(ret != -EEXIST);
	if (ret) {
		/* Pairs with create_reloc_root */
		btrfs_put_root(reloc_root);
		return ret;
	}
	new_root->reloc_root = btrfs_grab_root(reloc_root);

	if (rc->create_reloc_tree)
		ret = clone_backref_node(trans, rc, root, reloc_root);
	return ret;
}