// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "misc.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
#include "rcu-string.h"
#include "zoned.h"
#include "dev-replace.h"

#undef SCRAMBLE_DELAYED_REFS


static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);

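/*
 * Return non-zero if the block group has all of the requested allocation
 * bits (data/metadata/system and profile bits) set in its flags.
 */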
static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
			      u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&fs_info->excluded_extents, start, end,
			EXTENT_UPTODATE);
	return 0;
}

void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 start, end;

	start = cache->start;
	end = start + cache->length - 1;

	clear_extent_bits(&fs_info->excluded_extents, start, end,
			  EXTENT_UPTODATE);
}

/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	struct btrfs_root *root = btrfs_extent_root(fs_info, start);
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * to see what the reference count and extent flags would be if all of
 * the delayed refs are not processed.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_root *extent_root;
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
		offset = fs_info->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	extent_root = btrfs_extent_root(fs_info, bytenr);
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
			ret = -EINVAL;
			btrfs_print_v0_err(fs_info);
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);

			goto out_free;
		}

		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			refcount_inc(&head->refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and try
			 * again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref_head(head);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs is optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. The full back refs is for pointers in tree blocks not
 * referenced by their owner trees. The location of tree block is recorded
 * in the back refs. Actually the full back refs is generic, and can be
 * used in all cases the implicit back refs is used. The major shortcoming
 * of the full back refs is its overhead. Every time a tree block gets
 * COWed, we have to update back refs entry for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in old transaction, the
 * only way to drop a reference to it is COW it. So we can detect the
 * event that tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs is used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointers in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs is used for
 * pointers in the block. Add full back refs for every pointers in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf
 *
 * When a file extent is allocated, the implicit back refs is used.
 * The fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of key. The key offset for the implicit back refs is
 * objectid of block's owner tree. The key offset for the full back refs
 * is the first byte of parent block.
 *
 * When implicit back refs is used, information about the lowest key and
 * level of the tree block are required. This information is stored in
 * the tree block info structure.
 */

/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				     struct btrfs_extent_inline_ref *iref,
				     enum btrfs_inline_ref_type is_data)
{
	int type = btrfs_extent_inline_ref_type(eb, iref);
	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_DATA_REF_KEY ||
	    type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (is_data == BTRFS_REF_TYPE_BLOCK) {
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree block,
				 * which must be aligned to sector size.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->sectorsize))
					return type;
			}
		} else if (is_data == BTRFS_REF_TYPE_DATA) {
			if (type == BTRFS_EXTENT_DATA_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_DATA_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree block,
				 * which must be aligned to sector size.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->sectorsize))
					return type;
			}
		} else {
			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
			return type;
		}
	}

	btrfs_print_leaf((struct extent_buffer *)eb);
	btrfs_err(eb->fs_info,
		  "eb %llu iref 0x%lx invalid extent inline ref type %d",
		  eb->start, (unsigned long)iref, type);
	WARN_ON(1);

	return BTRFS_REF_TYPE_INVALID;
}

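/*
 * Hash the (root objectid, inode objectid, file offset) triple.  The result
 * is used as the key offset of implicit (EXTENT_DATA_REF) back references.
 */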
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}

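/*
 * Insert a data back reference item for the given extent, either a shared
 * data ref keyed on the parent block or an extent data ref keyed on the
 * (root, inode, offset) hash.  If the item already exists, its reference
 * count is increased by refs_to_add instead.
 */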
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}

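/*
 * Drop refs_to_drop references from the data back reference item that @path
 * points to, deleting the item when its count reaches zero.
 */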
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   int refs_to_drop, int *last_ref)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
		btrfs_print_v0_err(trans->fs_info);
		btrfs_abort_transaction(trans, -EINVAL);
		return -EINVAL;
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, root, path);
		*last_ref = 1;
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

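/*
 * Return the reference count stored in the data back reference that @path
 * (or the inline ref @iref, when given) points to.
 */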
static noinline u32 extent_data_ref_count(struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;
	int type;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
	if (iref) {
		/*
		 * If type is invalid, we should have bailed out earlier than
		 * this call.
		 */
		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
		ASSERT(type != BTRFS_REF_TYPE_INVALID);
		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

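/*
 * Look up the keyed (non-inline) back reference item for a tree block,
 * either a shared block ref or a tree block ref.  Returns -ENOENT if the
 * item does not exist.
 */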
static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	return ret;
}

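/*
 * Insert a keyed (non-inline) back reference item for a tree block, either
 * shared (keyed on the parent block) or indirect (keyed on the owner root).
 */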
static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_root *root = btrfs_extent_root(trans->fs_info, bytenr);
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
	btrfs_release_path(path);
	return ret;
}

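/*
 * Return the back reference key type to use for an extent: tree block refs
 * for metadata owners, data refs otherwise, with the shared variants used
 * when a parent block is given.
 */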
static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

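/*
 * Starting at @level, walk up the path and return the first key that comes
 * after the current slot.  Returns 1 if the path points at the last item.
 */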
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)

{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr);
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
	int needed;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->search_for_extension = 1;
		path->keep_locks = 1;
	} else
		extra_size = -1;

	/*
	 * Owner is our level, so we can just add one to get the level for the
	 * block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		skinny_metadata = false;
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

	if (ret && !insert) {
		err = -ENOENT;
		goto out;
	} else if (WARN_ON(ret)) {
		err = -EIO;
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size(leaf, path->slots[0]);
	if (unlikely(item_size < sizeof(*ei))) {
		err = -EINVAL;
		btrfs_print_v0_err(fs_info);
		btrfs_abort_transaction(trans, err);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
		needed = BTRFS_REF_TYPE_DATA;
	else
		needed = BTRFS_REF_TYPE_BLOCK;

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
		if (type == BTRFS_REF_TYPE_INVALID) {
			err = -EUCLEAN;
			goto out;
		}

		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		path->search_for_extension = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

/*
 * helper to add new inline back ref
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	btrfs_extend_item(path, size);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset);
	}
	return ret;
}

/*
 * helper to update/remove inline back ref
 */
static noinline_for_stack
void update_inline_extent_backref(struct btrfs_path *path,
				  struct btrfs_extent_inline_ref *iref,
				  int refs_to_mod,
				  struct btrfs_delayed_extent_op *extent_op,
				  int *last_ref)
{
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	u64 refs;

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	/*
	 * If type is invalid, we should have bailed out after
	 * lookup_inline_extent_backref().
	 */
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
	ASSERT(type != BTRFS_REF_TYPE_INVALID);

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
	}

	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		*last_ref = 1;
		size =  btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		btrfs_truncate_item(path, item_size, 1);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 1);
	if (ret == 0) {
		/*
		 * We're adding refs to a tree block we already own, this
		 * should not happen at all.
		 */
		if (owner < BTRFS_FIRST_FREE_OBJECTID) {
			btrfs_crit(trans->fs_info,
"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
				   bytenr, num_bytes, root_objectid);
			if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
				WARN_ON(1);
				btrfs_crit(trans->fs_info,
			"path->slots[0]=%d path->nodes[0]:", path->slots[0]);
				btrfs_print_leaf(path->nodes[0]);
			}
			return -EUCLEAN;
		}
		update_inline_extent_backref(path, iref, refs_to_add,
					     extent_op, NULL);
	} else if (ret == -ENOENT) {
		setup_inline_extent_backref(trans->fs_info, path, iref, parent,
					    root_objectid, owner, offset,
					    refs_to_add, extent_op);
		ret = 0;
	}
	return ret;
}

static int remove_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 int refs_to_drop, int is_data, int *last_ref)
{
	int ret = 0;

	BUG_ON(!is_data && refs_to_drop != 1);
	if (iref) {
		update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
					     last_ref);
	} else if (is_data) {
		ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
					     last_ref);
	} else {
		*last_ref = 1;
		ret = btrfs_del_item(trans, root, path);
	}
	return ret;
}

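/*
 * Issue discards for the given byte range on @bdev, skipping any superblock
 * mirrors that fall inside it.  The number of bytes actually discarded is
 * returned in @discarded_bytes.
 */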
static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
			       u64 *discarded_bytes)
{
	int j, ret = 0;
	u64 bytes_left, end;
	u64 aligned_start = ALIGN(start, 1 << 9);

	if (WARN_ON(start != aligned_start)) {
		len -= aligned_start - start;
		len = round_down(len, 1 << 9);
		start = aligned_start;
	}

	*discarded_bytes = 0;

	if (!len)
		return 0;

	end = start + len;
	bytes_left = len;

	/* Skip any superblocks on this device. */
	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
		u64 sb_start = btrfs_sb_offset(j);
		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
		u64 size = sb_start - start;

		if (!in_range(sb_start, start, bytes_left) &&
		    !in_range(sb_end, start, bytes_left) &&
		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
			continue;

		/*
		 * Superblock spans beginning of range.  Adjust start and
		 * try again.
		 */
		if (sb_start <= start) {
			start += sb_end - start;
			if (start > end) {
				bytes_left = 0;
				break;
			}
			bytes_left = end - start;
			continue;
		}

		if (size) {
			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
						   GFP_NOFS, 0);
			if (!ret)
				*discarded_bytes += size;
			else if (ret != -EOPNOTSUPP)
				return ret;
		}

		start = sb_end;
		if (start > end) {
			bytes_left = 0;
			break;
		}
		bytes_left = end - start;
	}

	if (bytes_left) {
		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
					   GFP_NOFS, 0);
		if (!ret)
			*discarded_bytes += bytes_left;
	}
	return ret;
}

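/*
 * Discard one mapped stripe: on zoned devices this resets the zone (and the
 * replace target, if any), otherwise it falls back to a regular discard.
 * The number of bytes handled is returned in @bytes.
 */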
static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes)
{
	struct btrfs_device *dev = stripe->dev;
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	u64 phys = stripe->physical;
	u64 len = stripe->length;
	u64 discarded = 0;
	int ret = 0;

	/* Zone reset on a zoned filesystem */
	if (btrfs_can_zone_reset(dev, phys, len)) {
		u64 src_disc;

		ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
		if (ret)
			goto out;

		if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
		    dev != dev_replace->srcdev)
			goto out;

		src_disc = discarded;

		/* Send to replace target as well */
		ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
					      &discarded);
		discarded += src_disc;
	} else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) {
		ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
	} else {
		ret = 0;
		*bytes = 0;
	}

out:
	*bytes = discarded;
	return ret;
}

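/*
 * Discard a logical extent: map it chunk by chunk to physical stripes and
 * discard each stripe on its device.  *actual_bytes, when provided, is set
 * to the total number of bytes discarded.
 */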
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
			 u64 num_bytes, u64 *actual_bytes)
{
	int ret = 0;
	u64 discarded_bytes = 0;
	u64 end = bytenr + num_bytes;
	u64 cur = bytenr;
	struct btrfs_io_context *bioc = NULL;

	/*
	 * Avoid races with device replace and make sure our bioc has devices
	 * associated to its stripes that don't go away while we are discarding.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	while (cur < end) {
		struct btrfs_io_stripe *stripe;
		int i;

		num_bytes = end - cur;
		/* Tell the block device(s) that the sectors can be discarded */
		ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
				      &num_bytes, &bioc, 0);
		/*
		 * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
		 * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
		 * thus we can't continue anyway.
		 */
		if (ret < 0)
			goto out;

		stripe = bioc->stripes;
		for (i = 0; i < bioc->num_stripes; i++, stripe++) {
			u64 bytes;
			struct btrfs_device *device = stripe->dev;

			if (!device->bdev) {
				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
				continue;
			}

			if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
				continue;

			ret = do_discard_extent(stripe, &bytes);
			if (!ret) {
				discarded_bytes += bytes;
			} else if (ret != -EOPNOTSUPP) {
				/*
				 * Logic errors or -ENOMEM, or -EIO, but
				 * unlikely to happen.
				 *
				 * And since there are two loops, explicitly
				 * go to out to avoid confusion.
				 */
				btrfs_put_bioc(bioc);
				goto out;
			}

			/*
			 * Just in case we get back EOPNOTSUPP for some reason,
			 * just ignore the return value so we don't screw up
			 * people calling discard_extent.
			 */
			ret = 0;
		}
		btrfs_put_bioc(bioc);
		cur += num_bytes;
	}
out:
	btrfs_bio_counter_dec(fs_info);

	if (actual_bytes)
		*actual_bytes = discarded_bytes;


	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret;
}

/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_ref *generic_ref)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
	       generic_ref->action);
	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
	       generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID);

	if (generic_ref->type == BTRFS_REF_METADATA)
		ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
	else
		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);

	btrfs_ref_tree_mod(fs_info, generic_ref);

	return ret;
}

/*
 * __btrfs_inc_extent_ref - insert backreference for a given extent
 *
 * The counterpart is in __btrfs_free_extent(), with examples and more details
 * how it works.
 *
 * @trans:	    Handle of transaction
 *
 * @node:	    The delayed ref node used to get the bytenr/length for
 *		    extent whose references are incremented.
 *
 * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
 *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
 *		    bytenr of the parent block. Since new extents are always
 *		    created with indirect references, this will only be the case
 *		    when relocating a shared extent. In that case, root_objectid
 *		    will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
 *		    be 0
 *
 * @root_objectid:  The id of the root where this modification has originated,
 *		    this can be either one of the well-known metadata trees or
 *		    the subvolume id which references this extent.
 *
 * @owner:	    For data extents it is the inode number of the owning file.
 *		    For metadata extents this parameter holds the level in the
 *		    tree of the extent.
 *
 * @offset:	    For metadata extents the offset is ignored and is currently
 *		    always passed as 0. For data extents it is the fileoffset
 *		    this extent belongs to.
 *
 * @refs_to_add     Number of references to add
 *
 * @extent_op       Pointer to a structure, holding information necessary when
 *                  updating a tree block's flags
 *
 */
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_delayed_ref_node *node,
				  u64 parent, u64 root_objectid,
				  u64 owner, u64 offset, int refs_to_add,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
	struct btrfs_key key;
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	u64 refs;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* this will setup the path even if it fails to insert the back ref */
	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
					   parent, root_objectid, owner,
					   offset, refs_to_add, extent_op);
	if ((ret < 0 && ret != -EAGAIN) || !ret)
		goto out;

	/*
	 * Ok we had -EAGAIN which means we didn't have space to insert an
	 * inline extent ref, so just update the reference count and add a
	 * normal backref.
	 */
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, item);
	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, item);

	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/* now insert the actual backref */
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		BUG_ON(refs_to_add != 1);
		ret = insert_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = insert_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset,
					     refs_to_add);
	}
	if (ret)
		btrfs_abort_transaction(trans, ret);
out:
	btrfs_free_path(path);
	return ret;
}

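/*
 * Apply one delayed reference update for a data extent: insert the reserved
 * extent item, add a back reference, or drop one, depending on the action.
 */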
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_key ins;
	u64 parent = 0;
	u64 ref_root = 0;
	u64 flags = 0;

	ins.objectid = node->bytenr;
	ins.offset = node->num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;

	ref = btrfs_delayed_node_to_data_ref(node);
	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);

	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
		parent = ref->parent;
	ref_root = ref->root;

	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		if (extent_op)
			flags |= extent_op->flags_to_set;
		ret = alloc_reserved_file_extent(trans, parent, ref_root,
						 flags, ref->objectid,
						 ref->offset, &ins,
						 node->ref_mod);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
					     ref->objectid, ref->offset,
					     node->ref_mod, extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, node, parent,
					  ref_root, ref->objectid,
					  ref->offset, node->ref_mod,
					  extent_op);
	} else {
		BUG();
	}
	return ret;
}

static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei)
{
	u64 flags = btrfs_extent_flags(leaf, ei);
	if (extent_op->update_flags) {
		flags |= extent_op->flags_to_set;
		btrfs_set_extent_flags(leaf, ei, flags);
	}

	if (extent_op->update_key) {
		struct btrfs_tree_block_info *bi;
		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
	}
}

static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
				 struct btrfs_delayed_ref_head *head,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	u32 item_size;
	int ret;
	int err = 0;
	int metadata = !extent_op->is_data;

	if (TRANS_ABORTED(trans))
		return 0;

	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		metadata = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = head->bytenr;

	if (metadata) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = extent_op->level;
	} else {
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = head->num_bytes;
	}

	root = btrfs_extent_root(fs_info, key.objectid);
again:
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}
	if (ret > 0) {
		if (metadata) {
			if (path->slots[0] > 0) {
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == head->bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == head->num_bytes)
					ret = 0;
			}
			if (ret > 0) {
				btrfs_release_path(path);
				metadata = 0;

				key.objectid = head->bytenr;
				key.offset = head->num_bytes;
				key.type = BTRFS_EXTENT_ITEM_KEY;
				goto again;
			}
		} else {
			err = -EIO;
			goto out;
		}
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size(leaf, path->slots[0]);

	if (unlikely(item_size < sizeof(*ei))) {
		err = -EINVAL;
		btrfs_print_v0_err(fs_info);
		btrfs_abort_transaction(trans, err);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	__run_delayed_extent_op(extent_op, leaf, ei);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return err;
}

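/*
 * Apply one delayed reference update for a tree block.  Tree blocks only
 * ever carry a single reference per ref entry, so node->ref_mod must be 1.
 */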
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_tree_ref *ref;
	u64 parent = 0;
	u64 ref_root = 0;

	ref = btrfs_delayed_node_to_tree_ref(node);
	trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		parent = ref->parent;
	ref_root = ref->root;

	if (node->ref_mod != 1) {
		btrfs_err(trans->fs_info,
	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
			  node->bytenr, node->ref_mod, node->action, ref_root,
			  parent);
		return -EIO;
	}
	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		BUG_ON(!extent_op || !extent_op->update_flags);
		ret = alloc_reserved_tree_block(trans, node, extent_op);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
					     ref->level, 0, 1, extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, node, parent, ref_root,
					  ref->level, 0, 1, extent_op);
	} else {
		BUG();
	}
	return ret;
}

/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node,
			       struct btrfs_delayed_extent_op *extent_op,
			       int insert_reserved)
{
	int ret = 0;

	if (TRANS_ABORTED(trans)) {
		if (insert_reserved)
			btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
		return 0;
	}

	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		ret = run_delayed_tree_ref(trans, node, extent_op,
					   insert_reserved);
	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
		 node->type == BTRFS_SHARED_DATA_REF_KEY)
		ret = run_delayed_data_ref(trans, node, extent_op,
					   insert_reserved);
	else
		BUG();
	if (ret && insert_reserved)
		btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
	return ret;
}

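/* Pick the next delayed ref node of this head to run, preferring additions. */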
static inline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
1733 1734
	struct btrfs_delayed_ref_node *ref;

1735
	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1736
		return NULL;
1737

1738 1739 1740 1741 1742 1743
	/*
	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
	 * This is to prevent a ref count from going down to zero, which deletes
	 * the extent item from the extent tree, when there still are references
	 * to add, which would fail because they would not find the extent item.
	 */
1744 1745 1746 1747
	if (!list_empty(&head->ref_add_list))
		return list_first_entry(&head->ref_add_list,
				struct btrfs_delayed_ref_node, add_list);

1748
	ref = rb_entry(rb_first_cached(&head->ref_tree),
1749
		       struct btrfs_delayed_ref_node, ref_node);
1750 1751
	ASSERT(list_empty(&ref->add_list));
	return ref;
1752 1753
}

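/*
 * Return a ref head to the "ready" state after a failed or postponed run:
 * clear the processing flag, bump num_heads_ready and drop the head's mutex.
 */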
static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
				      struct btrfs_delayed_ref_head *head)
{
	spin_lock(&delayed_refs->lock);
	head->processing = 0;
	delayed_refs->num_heads_ready++;
	spin_unlock(&delayed_refs->lock);
	btrfs_delayed_ref_unlock(head);
}

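/*
 * Fetch the head's pending extent op, if any.  When the head still has to
 * insert its reserved extent the op is applied by the insertion itself, so
 * in that case it is freed here and NULL is returned.
 */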
static struct btrfs_delayed_extent_op *cleanup_extent_op(
				struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_extent_op *extent_op = head->extent_op;

	if (!extent_op)
		return NULL;

	if (head->must_insert_reserved) {
		head->extent_op = NULL;
		btrfs_free_delayed_extent_op(extent_op);
		return NULL;
	}
	return extent_op;
}

static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	extent_op = cleanup_extent_op(head);
	if (!extent_op)
		return 0;
	head->extent_op = NULL;
	spin_unlock(&head->lock);
	ret = run_delayed_extent_op(trans, head, extent_op);
	btrfs_free_delayed_extent_op(extent_op);
	return ret ? ret : 1;
}

void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
				  struct btrfs_delayed_ref_root *delayed_refs,
				  struct btrfs_delayed_ref_head *head)
{
	int nr_items = 1;	/* Dropping this ref head update. */

	/*
	 * We had csum deletions accounted for in our delayed refs rsv, we need
	 * to drop the csum leaves for this update from our delayed_refs_rsv.
	 */
	if (head->total_ref_mod < 0 && head->is_data) {
		spin_lock(&delayed_refs->lock);
		delayed_refs->pending_csums -= head->num_bytes;
		spin_unlock(&delayed_refs->lock);
		nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
	}

	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
}

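/*
 * Final processing of a delayed ref head once all of its refs have run:
 * apply any remaining extent op, remove the head from the rbtree and, when
 * the reserved extent was never inserted, pin it again and drop its csums.
 */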
static int cleanup_ref_head(struct btrfs_trans_handle *trans,
			    struct btrfs_delayed_ref_head *head)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	ret = run_and_cleanup_extent_op(trans, head);
	if (ret < 0) {
		unselect_delayed_ref_head(delayed_refs, head);
		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
		return ret;
	} else if (ret) {
		return ret;
	}

	/*
	 * Need to drop our head ref lock and re-acquire the delayed ref lock
	 * and then re-check to make sure nobody got added.
	 */
	spin_unlock(&head->lock);
	spin_lock(&delayed_refs->lock);
	spin_lock(&head->lock);
	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
		spin_unlock(&head->lock);
		spin_unlock(&delayed_refs->lock);
		return 1;
	}
	btrfs_delete_ref_head(delayed_refs, head);
	spin_unlock(&head->lock);
	spin_unlock(&delayed_refs->lock);

	if (head->must_insert_reserved) {
		btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
		if (head->is_data) {
			struct btrfs_root *csum_root;

			csum_root = btrfs_csum_root(fs_info, head->bytenr);
			ret = btrfs_del_csums(trans, csum_root, head->bytenr,
					      head->num_bytes);
		}
	}

	btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);

	trace_run_delayed_ref_head(fs_info, head, 0);
	btrfs_delayed_ref_unlock(head);
	btrfs_put_delayed_ref_head(head);
	return ret;
}

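/*
 * Select the next delayed ref head to run and take its lock.  Returns NULL
 * when there are no heads ready, or ERR_PTR(-EAGAIN) when the head was freed
 * while we waited for its mutex and the caller should pick another one.
 */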
static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
					struct btrfs_trans_handle *trans)
{
	struct btrfs_delayed_ref_root *delayed_refs =
		&trans->transaction->delayed_refs;
	struct btrfs_delayed_ref_head *head = NULL;
	int ret;

	spin_lock(&delayed_refs->lock);
	head = btrfs_select_ref_head(delayed_refs);
	if (!head) {
		spin_unlock(&delayed_refs->lock);
		return head;
	}

	/*
	 * Grab the lock that says we are going to process all the refs for
	 * this head
	 */
	ret = btrfs_delayed_ref_lock(delayed_refs, head);
	spin_unlock(&delayed_refs->lock);

	/*
	 * We may have dropped the spin lock to get the head mutex lock, and
	 * that might have given someone else time to free the head.  If that's
	 * true, it has been removed from our list and we can move on.
	 */
	if (ret == -EAGAIN)
		head = ERR_PTR(-EAGAIN);

	return head;
}

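/*
 * Run the delayed refs queued on one head, one at a time.  Add/drop pairs are
 * merged first, the head's ref_mod is kept in sync and every processed ref is
 * counted in *run_refs.  Returns -EAGAIN if a ref is still blocked behind a
 * tree mod log sequence number, < 0 on other errors, 0 once the head is drained.
 */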
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
				    struct btrfs_delayed_ref_head *locked_ref,
				    unsigned long *run_refs)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_extent_op *extent_op;
	struct btrfs_delayed_ref_node *ref;
	int must_insert_reserved = 0;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	lockdep_assert_held(&locked_ref->mutex);
	lockdep_assert_held(&locked_ref->lock);

	while ((ref = select_delayed_ref(locked_ref))) {
		if (ref->seq &&
		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
			spin_unlock(&locked_ref->lock);
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			return -EAGAIN;
		}

		(*run_refs)++;
		ref->in_tree = 0;
		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
		RB_CLEAR_NODE(&ref->ref_node);
		if (!list_empty(&ref->add_list))
			list_del(&ref->add_list);
		/*
		 * When we play the delayed ref, also correct the ref_mod on
		 * head
		 */
		switch (ref->action) {
		case BTRFS_ADD_DELAYED_REF:
		case BTRFS_ADD_DELAYED_EXTENT:
			locked_ref->ref_mod -= ref->ref_mod;
			break;
		case BTRFS_DROP_DELAYED_REF:
			locked_ref->ref_mod += ref->ref_mod;
			break;
		default:
			WARN_ON(1);
		}
		atomic_dec(&delayed_refs->num_entries);

		/*
		 * Record the must_insert_reserved flag before we drop the
		 * spin lock.
		 */
		must_insert_reserved = locked_ref->must_insert_reserved;
		locked_ref->must_insert_reserved = 0;

		extent_op = locked_ref->extent_op;
		locked_ref->extent_op = NULL;
		spin_unlock(&locked_ref->lock);

		ret = run_one_delayed_ref(trans, ref, extent_op,
					  must_insert_reserved);

		btrfs_free_delayed_extent_op(extent_op);
		if (ret) {
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			btrfs_put_delayed_ref(ref);
			btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
				    ret);
			return ret;
		}

		btrfs_put_delayed_ref(ref);
		cond_resched();

		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
	}

	return 0;
}

/*
 * Returns 0 on success or if called with an already aborted transaction.
 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
 */
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
					     unsigned long nr)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *locked_ref = NULL;
	ktime_t start = ktime_get();
	int ret;
	unsigned long count = 0;
	unsigned long actual_count = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	do {
		if (!locked_ref) {
			locked_ref = btrfs_obtain_ref_head(trans);
			if (IS_ERR_OR_NULL(locked_ref)) {
				if (PTR_ERR(locked_ref) == -EAGAIN) {
					continue;
				} else {
					break;
				}
			}
			count++;
		}
		/*
		 * We need to try and merge add/drops of the same ref since we
		 * can run into issues with relocate dropping the implicit ref
		 * and then it being added back again before the drop can
		 * finish.  If we merged anything we need to re-loop so we can
		 * get a good ref.
		 * Or we can get node references of the same type that weren't
		 * merged when created due to bumps in the tree mod seq, and
		 * we need to merge them to prevent adding an inline extent
		 * backref before dropping it (triggering a BUG_ON at
		 * insert_inline_extent_backref()).
		 */
		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);

		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
						      &actual_count);
		if (ret < 0 && ret != -EAGAIN) {
			/*
			 * Error, btrfs_run_delayed_refs_for_head already
			 * unlocked everything so just bail out
			 */
			return ret;
		} else if (!ret) {
			/*
			 * Success, perform the usual cleanup of a processed
			 * head
			 */
			ret = cleanup_ref_head(trans, locked_ref);
			if (ret > 0) {
				/* We dropped our lock, we need to loop. */
				ret = 0;
				continue;
			} else if (ret) {
				return ret;
			}
		}

		/*
		 * Either success case or btrfs_run_delayed_refs_for_head
		 * returned -EAGAIN, meaning we need to select another head
		 */

		locked_ref = NULL;
		cond_resched();
	} while ((nr != -1 && count < nr) || locked_ref);

	/*
	 * We don't want to include ref heads since we can have empty ref heads
	 * and those will drastically skew our runtime down since we just do
	 * accounting, no actual extent tree updates.
	 */
	if (actual_count > 0) {
		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
		u64 avg;

		/*
		 * We weigh the current average higher than our current runtime
		 * to avoid large swings in the average.
		 */
		spin_lock(&delayed_refs->lock);
		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
		spin_unlock(&delayed_refs->lock);
	}
	return 0;
}

#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning
 */
static u64 find_middle(struct rb_root *root)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int alt = 1;
	u64 middle;
	u64 first = 0, last = 0;

	n = rb_first(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		first = entry->bytenr;
	}
	n = rb_last(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		last = entry->bytenr;
	}
	n = root->rb_node;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		middle = entry->bytenr;

		if (alt)
			n = n->rb_left;
		else
			n = n->rb_right;

		alt = 1 - alt;
	}
	return middle;
}
#endif

/*
 * this starts processing the delayed reference count updates and
 * extent insertions we have queued up so far.  count can be
 * 0, which means to process everything in the tree at the start
 * of the run (but not newly added entries), or it can be some target
 * number you'd like to process.
 *
 * Returns 0 on success or if called with an aborted transaction
 * Returns <0 on error and aborts the transaction
 */
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
			   unsigned long count)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct rb_node *node;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *head;
	int ret;
	int run_all = count == (unsigned long)-1;

	/* We'll clean this up in btrfs_cleanup_transaction */
	if (TRANS_ABORTED(trans))
		return 0;

	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
		return 0;

	delayed_refs = &trans->transaction->delayed_refs;
	if (count == 0)
		count = delayed_refs->num_heads_ready;

again:
#ifdef SCRAMBLE_DELAYED_REFS
	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
	ret = __btrfs_run_delayed_refs(trans, count);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (run_all) {
		btrfs_create_pending_block_groups(trans);

		spin_lock(&delayed_refs->lock);
		node = rb_first_cached(&delayed_refs->href_root);
		if (!node) {
			spin_unlock(&delayed_refs->lock);
			goto out;
		}
		head = rb_entry(node, struct btrfs_delayed_ref_head,
				href_node);
		refcount_inc(&head->refs);
		spin_unlock(&delayed_refs->lock);

		/* Mutex was contended, block until it's released and retry. */
		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);

		btrfs_put_delayed_ref_head(head);
		cond_resched();
		goto again;
	}
out:
	return 0;
}

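/*
 * Queue a delayed extent op that rewrites the flags stored in the extent
 * item backing @eb.  A typical use (sketch only, caller-side details may
 * vary) is switching a tree block to full backrefs:
 *
 *	btrfs_set_disk_extent_flags(trans, buf, BTRFS_BLOCK_FLAG_FULL_BACKREF,
 *				    btrfs_header_level(buf), 0);
 */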
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
				struct extent_buffer *eb, u64 flags,
				int level, int is_data)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	extent_op = btrfs_alloc_delayed_extent_op();
	if (!extent_op)
		return -ENOMEM;

	extent_op->flags_to_set = flags;
	extent_op->update_flags = true;
	extent_op->update_key = false;
	extent_op->is_data = is_data ? true : false;
	extent_op->level = level;

	ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
	if (ret)
		btrfs_free_delayed_extent_op(extent_op);
	return ret;
}

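/*
 * Look for other references to @bytenr among the delayed refs of the running
 * transaction.  Returns 1 if a reference from another root/inode/offset
 * exists, 0 if not, and -EAGAIN when the head's mutex was contended and the
 * caller has to retry.
 */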
static noinline int check_delayed_ref(struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid, u64 offset, u64 bytenr)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_data_ref *data_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_transaction *cur_trans;
	struct rb_node *node;
	int ret = 0;

	spin_lock(&root->fs_info->trans_lock);
	cur_trans = root->fs_info->running_transaction;
	if (cur_trans)
		refcount_inc(&cur_trans->use_count);
	spin_unlock(&root->fs_info->trans_lock);
	if (!cur_trans)
		return 0;

	delayed_refs = &cur_trans->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (!head) {
		spin_unlock(&delayed_refs->lock);
		btrfs_put_transaction(cur_trans);
		return 0;
	}

	if (!mutex_trylock(&head->mutex)) {
		refcount_inc(&head->refs);
		spin_unlock(&delayed_refs->lock);

		btrfs_release_path(path);

		/*
		 * Mutex was contended, block until it's released and let
		 * caller try again
		 */
		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref_head(head);
		btrfs_put_transaction(cur_trans);
		return -EAGAIN;
	}
	spin_unlock(&delayed_refs->lock);

	spin_lock(&head->lock);
	/*
	 * XXX: We should replace this with a proper search function in the
	 * future.
	 */
	for (node = rb_first_cached(&head->ref_tree); node;
	     node = rb_next(node)) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		/* If it's a shared ref we know a cross reference exists */
		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
			ret = 1;
			break;
		}

		data_ref = btrfs_delayed_node_to_data_ref(ref);

		/*
		 * If our ref doesn't match the one we're currently looking at
		 * then we have a cross reference.
		 */
		if (data_ref->root != root->root_key.objectid ||
		    data_ref->objectid != objectid ||
		    data_ref->offset != offset) {
			ret = 1;
			break;
		}
	}
	spin_unlock(&head->lock);
	mutex_unlock(&head->mutex);
	btrfs_put_transaction(cur_trans);
	return ret;
}

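/*
 * Look for other references to @bytenr in the committed extent tree.  Returns
 * 0 when the extent is referenced only by (@root, @objectid, @offset), 1 when
 * it is (or may be) shared, -ENOENT when no matching extent item was found,
 * or another negative error.
 */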
static noinline int check_committed_ref(struct btrfs_root *root,
					struct btrfs_path *path,
					u64 objectid, u64 offset, u64 bytenr,
					bool strict)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bytenr);
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_extent_item *ei;
	struct btrfs_key key;
	u32 item_size;
	int type;
	int ret;

	key.objectid = bytenr;
	key.offset = (u64)-1;
	key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	BUG_ON(ret == 0); /* Corruption */

	ret = -ENOENT;
	if (path->slots[0] == 0)
		goto out;

	path->slots[0]--;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
		goto out;

	ret = 1;
	item_size = btrfs_item_size(leaf, path->slots[0]);
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);

	/* If extent item has more than 1 inline ref then it's shared */
	if (item_size != sizeof(*ei) +
	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
		goto out;

	/*
	 * If extent created before last snapshot => it's shared unless the
	 * snapshot has been deleted. Use the heuristic if strict is false.
	 */
	if (!strict &&
	    (btrfs_extent_generation(leaf, ei) <=
	     btrfs_root_last_snapshot(&root->root_item)))
		goto out;

	iref = (struct btrfs_extent_inline_ref *)(ei + 1);

	/* If this extent has SHARED_DATA_REF then it's shared */
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
	if (type != BTRFS_EXTENT_DATA_REF_KEY)
		goto out;

	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
	if (btrfs_extent_refs(leaf, ei) !=
	    btrfs_extent_data_ref_count(leaf, ref) ||
	    btrfs_extent_data_ref_root(leaf, ref) !=
	    root->root_key.objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		goto out;

	ret = 0;
out:
	return ret;
}

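/*
 * Check whether the data extent at @bytenr is referenced by anything other
 * than (@root, @objectid, @offset), consulting both the committed extent tree
 * and the delayed refs of the running transaction.
 */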
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
			  u64 bytenr, bool strict)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	do {
		ret = check_committed_ref(root, path, objectid,
					  offset, bytenr, strict);
		if (ret && ret != -ENOENT)
			goto out;

		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
	} while (ret == -EAGAIN);

out:
	btrfs_free_path(path);
	if (btrfs_is_data_reloc_root(root))
		WARN_ON(ret > 0);
	return ret;
}
C
Chris Mason 已提交
2392

2393
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2394
			   struct btrfs_root *root,
2395
			   struct extent_buffer *buf,
2396
			   int full_backref, int inc)
Z
Zheng Yan 已提交
2397
{
2398
	struct btrfs_fs_info *fs_info = root->fs_info;
Z
Zheng Yan 已提交
2399
	u64 bytenr;
2400 2401
	u64 num_bytes;
	u64 parent;
Z
Zheng Yan 已提交
2402 2403 2404 2405
	u64 ref_root;
	u32 nritems;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
2406 2407
	struct btrfs_ref generic_ref = { 0 };
	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
Z
Zheng Yan 已提交
2408
	int i;
2409
	int action;
Z
Zheng Yan 已提交
2410 2411
	int level;
	int ret = 0;
2412

2413
	if (btrfs_is_testing(fs_info))
2414
		return 0;
2415

Z
Zheng Yan 已提交
2416 2417 2418 2419
	ref_root = btrfs_header_owner(buf);
	nritems = btrfs_header_nritems(buf);
	level = btrfs_header_level(buf);

2420
	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
2421
		return 0;
Z
Zheng Yan 已提交
2422

2423 2424 2425 2426
	if (full_backref)
		parent = buf->start;
	else
		parent = 0;
2427 2428 2429 2430
	if (inc)
		action = BTRFS_ADD_DELAYED_REF;
	else
		action = BTRFS_DROP_DELAYED_REF;
2431 2432

	for (i = 0; i < nritems; i++) {
Z
Zheng Yan 已提交
2433
		if (level == 0) {
2434
			btrfs_item_key_to_cpu(buf, &key, i);
2435
			if (key.type != BTRFS_EXTENT_DATA_KEY)
Z
Zheng Yan 已提交
2436
				continue;
2437
			fi = btrfs_item_ptr(buf, i,
Z
Zheng Yan 已提交
2438 2439 2440 2441 2442 2443 2444
					    struct btrfs_file_extent_item);
			if (btrfs_file_extent_type(buf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE)
				continue;
			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
			if (bytenr == 0)
				continue;
2445 2446 2447

			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
			key.offset -= btrfs_file_extent_offset(buf, fi);
2448 2449 2450
			btrfs_init_generic_ref(&generic_ref, action, bytenr,
					       num_bytes, parent);
			btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
2451 2452
					    key.offset, root->root_key.objectid,
					    for_reloc);
2453
			if (inc)
2454
				ret = btrfs_inc_extent_ref(trans, &generic_ref);
2455
			else
2456
				ret = btrfs_free_extent(trans, &generic_ref);
Z
Zheng Yan 已提交
2457 2458 2459
			if (ret)
				goto fail;
		} else {
2460
			bytenr = btrfs_node_blockptr(buf, i);
2461
			num_bytes = fs_info->nodesize;
2462 2463
			btrfs_init_generic_ref(&generic_ref, action, bytenr,
					       num_bytes, parent);
2464 2465
			btrfs_init_tree_ref(&generic_ref, level - 1, ref_root,
					    root->root_key.objectid, for_reloc);
2466
			if (inc)
2467
				ret = btrfs_inc_extent_ref(trans, &generic_ref);
2468
			else
2469
				ret = btrfs_free_extent(trans, &generic_ref);
Z
Zheng Yan 已提交
2470 2471 2472 2473 2474 2475
			if (ret)
				goto fail;
		}
	}
	return 0;
fail:
2476 2477 2478 2479
	return ret;
}

int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2480
		  struct extent_buffer *buf, int full_backref)
2481
{
2482
	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2483 2484 2485
}

int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2486
		  struct extent_buffer *buf, int full_backref)
2487
{
2488
	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
Z
Zheng Yan 已提交
2489 2490
}

2491
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
J
Josef Bacik 已提交
2492
{
2493
	struct btrfs_fs_info *fs_info = root->fs_info;
2494
	u64 flags;
D
David Woodhouse 已提交
2495
	u64 ret;
J
Josef Bacik 已提交
2496

2497 2498
	if (data)
		flags = BTRFS_BLOCK_GROUP_DATA;
2499
	else if (root == fs_info->chunk_root)
2500
		flags = BTRFS_BLOCK_GROUP_SYSTEM;
J
Josef Bacik 已提交
2501
	else
2502
		flags = BTRFS_BLOCK_GROUP_METADATA;
J
Josef Bacik 已提交
2503

2504
	ret = btrfs_get_alloc_profile(fs_info, flags);
D
David Woodhouse 已提交
2505
	return ret;
J
Josef Bacik 已提交
2506
}
J
Josef Bacik 已提交
2507

2508
static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
2509
{
2510
	struct btrfs_block_group *cache;
2511
	u64 bytenr;
J
Josef Bacik 已提交
2512

2513 2514 2515
	spin_lock(&fs_info->block_group_cache_lock);
	bytenr = fs_info->first_logical_byte;
	spin_unlock(&fs_info->block_group_cache_lock);
2516 2517 2518 2519

	if (bytenr < (u64)-1)
		return bytenr;

2520
	cache = btrfs_lookup_first_block_group(fs_info, search_start);
J
Josef Bacik 已提交
2521
	if (!cache)
2522
		return 0;
J
Josef Bacik 已提交
2523

2524
	bytenr = cache->start;
2525
	btrfs_put_block_group(cache);
2526 2527

	return bytenr;
2528 2529
}

2530 2531
static int pin_down_extent(struct btrfs_trans_handle *trans,
			   struct btrfs_block_group *cache,
2532
			   u64 bytenr, u64 num_bytes, int reserved)
2533
{
2534 2535
	struct btrfs_fs_info *fs_info = cache->fs_info;

2536 2537 2538
	spin_lock(&cache->space_info->lock);
	spin_lock(&cache->lock);
	cache->pinned += num_bytes;
2539 2540
	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
					     num_bytes);
2541 2542 2543 2544 2545 2546
	if (reserved) {
		cache->reserved -= num_bytes;
		cache->space_info->bytes_reserved -= num_bytes;
	}
	spin_unlock(&cache->lock);
	spin_unlock(&cache->space_info->lock);
J
Josef Bacik 已提交
2547

2548
	set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
2549 2550 2551
			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
	return 0;
}
J
Josef Bacik 已提交
2552

2553
int btrfs_pin_extent(struct btrfs_trans_handle *trans,
2554 2555
		     u64 bytenr, u64 num_bytes, int reserved)
{
2556
	struct btrfs_block_group *cache;
J
Josef Bacik 已提交
2557

2558
	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2559
	BUG_ON(!cache); /* Logic error */
2560

2561
	pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
2562 2563

	btrfs_put_block_group(cache);
2564 2565 2566
	return 0;
}

2567
/*
2568 2569
 * this function must be called within transaction
 */
2570
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2571 2572
				    u64 bytenr, u64 num_bytes)
{
2573
	struct btrfs_block_group *cache;
2574
	int ret;
2575

2576
	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2577 2578
	if (!cache)
		return -EINVAL;
2579 2580 2581 2582 2583 2584 2585

	/*
	 * pull in the free space cache (if any) so that our pin
	 * removes the free space from the cache.  We have load_only set
	 * to one because the slow code to read in the free extents does check
	 * the pinned extents.
	 */
2586
	btrfs_cache_block_group(cache, 1);
2587 2588 2589 2590 2591 2592 2593
	/*
	 * Make sure we wait until the cache is completely built in case it is
	 * missing or is invalid and therefore needs to be rebuilt.
	 */
	ret = btrfs_wait_block_group_cache_done(cache);
	if (ret)
		goto out;
2594

2595
	pin_down_extent(trans, cache, bytenr, num_bytes, 0);
2596 2597

	/* remove us from the free space cache (if we're there at all) */
2598
	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
2599
out:
2600
	btrfs_put_block_group(cache);
2601
	return ret;
2602 2603
}

2604 2605
static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
				   u64 start, u64 num_bytes)
2606 2607
{
	int ret;
2608
	struct btrfs_block_group *block_group;
2609

2610
	block_group = btrfs_lookup_block_group(fs_info, start);
2611 2612 2613
	if (!block_group)
		return -EINVAL;

2614 2615 2616 2617 2618 2619 2620 2621
	btrfs_cache_block_group(block_group, 1);
	/*
	 * Make sure we wait until the cache is completely built in case it is
	 * missing or is invalid and therefore needs to be rebuilt.
	 */
	ret = btrfs_wait_block_group_cache_done(block_group);
	if (ret)
		goto out;
2622

2623 2624
	ret = btrfs_remove_free_space(block_group, start, num_bytes);
out:
2625 2626 2627 2628
	btrfs_put_block_group(block_group);
	return ret;
}

2629
int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2630
{
2631
	struct btrfs_fs_info *fs_info = eb->fs_info;
2632 2633 2634 2635
	struct btrfs_file_extent_item *item;
	struct btrfs_key key;
	int found_type;
	int i;
2636
	int ret = 0;
2637

2638
	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652
		return 0;

	for (i = 0; i < btrfs_header_nritems(eb); i++) {
		btrfs_item_key_to_cpu(eb, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
		found_type = btrfs_file_extent_type(eb, item);
		if (found_type == BTRFS_FILE_EXTENT_INLINE)
			continue;
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			continue;
		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2653 2654 2655
		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
		if (ret)
			break;
2656 2657
	}

2658
	return ret;
2659 2660
}

2661
static void
2662
btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2663 2664 2665 2666
{
	atomic_inc(&bg->reservations);
}

2667 2668 2669 2670 2671
/*
 * Returns the free cluster for the given space info and sets empty_cluster to
 * what it should be based on the mount options.
 */
static struct btrfs_free_cluster *
2672 2673
fetch_cluster_info(struct btrfs_fs_info *fs_info,
		   struct btrfs_space_info *space_info, u64 *empty_cluster)
2674 2675 2676 2677 2678 2679 2680 2681
{
	struct btrfs_free_cluster *ret = NULL;

	*empty_cluster = 0;
	if (btrfs_mixed_space_info(space_info))
		return ret;

	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2682
		ret = &fs_info->meta_alloc_cluster;
2683 2684 2685
		if (btrfs_test_opt(fs_info, SSD))
			*empty_cluster = SZ_2M;
		else
2686
			*empty_cluster = SZ_64K;
2687 2688 2689
	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
		*empty_cluster = SZ_2M;
2690
		ret = &fs_info->data_alloc_cluster;
2691 2692 2693 2694 2695
	}

	return ret;
}

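/*
 * Unpin the range [@start, @end]: drop the bytes from each block group's
 * pinned counters and, when @return_free_space is set, hand the space back to
 * the free space cache, the global block reserve and any waiting tickets.
 */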
static int unpin_extent_range(struct btrfs_fs_info *fs_info,
			      u64 start, u64 end,
2698
			      const bool return_free_space)
C
Chris Mason 已提交
2699
{
2700
	struct btrfs_block_group *cache = NULL;
2701 2702
	struct btrfs_space_info *space_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2703
	struct btrfs_free_cluster *cluster = NULL;
2704
	u64 len;
2705 2706
	u64 total_unpinned = 0;
	u64 empty_cluster = 0;
2707
	bool readonly;
C
Chris Mason 已提交
2708

2709
	while (start <= end) {
2710
		readonly = false;
2711
		if (!cache ||
2712
		    start >= cache->start + cache->length) {
2713 2714
			if (cache)
				btrfs_put_block_group(cache);
2715
			total_unpinned = 0;
2716
			cache = btrfs_lookup_block_group(fs_info, start);
2717
			BUG_ON(!cache); /* Logic error */
2718

2719
			cluster = fetch_cluster_info(fs_info,
2720 2721 2722
						     cache->space_info,
						     &empty_cluster);
			empty_cluster <<= 1;
2723 2724
		}

2725
		len = cache->start + cache->length - start;
2726 2727
		len = min(len, end + 1 - start);

2728
		down_read(&fs_info->commit_root_sem);
2729 2730 2731 2732
		if (start < cache->last_byte_to_unpin && return_free_space) {
			u64 add_len = min(len, cache->last_byte_to_unpin - start);

			btrfs_add_free_space(cache, start, add_len);
2733
		}
2734
		up_read(&fs_info->commit_root_sem);
2735

2736
		start += len;
2737
		total_unpinned += len;
2738
		space_info = cache->space_info;
2739

2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752
		/*
		 * If this space cluster has been marked as fragmented and we've
		 * unpinned enough in this block group to potentially allow a
		 * cluster to be created inside of it go ahead and clear the
		 * fragmented check.
		 */
		if (cluster && cluster->fragmented &&
		    total_unpinned > empty_cluster) {
			spin_lock(&cluster->lock);
			cluster->fragmented = 0;
			spin_unlock(&cluster->lock);
		}

2753
		spin_lock(&space_info->lock);
2754 2755
		spin_lock(&cache->lock);
		cache->pinned -= len;
2756
		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2757
		space_info->max_extent_size = 0;
2758 2759 2760
		if (cache->ro) {
			space_info->bytes_readonly += len;
			readonly = true;
2761 2762 2763 2764
		} else if (btrfs_is_zoned(fs_info)) {
			/* Need reset before reusing in a zoned block group */
			space_info->bytes_zone_unusable += len;
			readonly = true;
2765
		}
2766
		spin_unlock(&cache->lock);
2767 2768 2769
		if (!readonly && return_free_space &&
		    global_rsv->space_info == space_info) {
			u64 to_add = len;
2770

2771 2772
			spin_lock(&global_rsv->lock);
			if (!global_rsv->full) {
2773 2774 2775
				to_add = min(len, global_rsv->size -
					     global_rsv->reserved);
				global_rsv->reserved += to_add;
2776 2777
				btrfs_space_info_update_bytes_may_use(fs_info,
						space_info, to_add);
2778 2779
				if (global_rsv->reserved >= global_rsv->size)
					global_rsv->full = 1;
2780
				len -= to_add;
2781 2782 2783
			}
			spin_unlock(&global_rsv->lock);
		}
2784 2785 2786
		/* Add to any tickets we may have */
		if (!readonly && return_free_space && len)
			btrfs_try_granting_tickets(fs_info, space_info);
2787
		spin_unlock(&space_info->lock);
C
Chris Mason 已提交
2788
	}
2789 2790 2791

	if (cache)
		btrfs_put_block_group(cache);
C
Chris Mason 已提交
2792 2793 2794
	return 0;
}

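/*
 * Called when a transaction commits: discard (if enabled) and unpin every
 * extent pinned during the transaction, then discard the block groups that
 * were removed while it ran.
 */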
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2796
{
2797
	struct btrfs_fs_info *fs_info = trans->fs_info;
2798
	struct btrfs_block_group *block_group, *tmp;
2799
	struct list_head *deleted_bgs;
2800
	struct extent_io_tree *unpin;
2801 2802
	u64 start;
	u64 end;
2803 2804
	int ret;

2805
	unpin = &trans->transaction->pinned_extents;
2806

2807
	while (!TRANS_ABORTED(trans)) {
2808 2809
		struct extent_state *cached_state = NULL;

2810
		mutex_lock(&fs_info->unused_bg_unpin_mutex);
2811
		ret = find_first_extent_bit(unpin, 0, &start, &end,
2812
					    EXTENT_DIRTY, &cached_state);
2813 2814
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2815
			break;
2816
		}
2817

2818
		if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2819
			ret = btrfs_discard_extent(fs_info, start,
2820
						   end + 1 - start, NULL);
2821

2822
		clear_extent_dirty(unpin, start, end, &cached_state);
2823
		unpin_extent_range(fs_info, start, end, true);
2824
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2825
		free_extent_state(cached_state);
2826
		cond_resched();
2827
	}
J
Josef Bacik 已提交
2828

2829 2830
	if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
		btrfs_discard_calc_delay(&fs_info->discard_ctl);
2831
		btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
2832
	}
2833

2834 2835 2836 2837 2838 2839 2840 2841 2842 2843
	/*
	 * Transaction is finished.  We don't need the lock anymore.  We
	 * do need to clean up the block groups in case of a transaction
	 * abort.
	 */
	deleted_bgs = &trans->transaction->deleted_bgs;
	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
		u64 trimmed = 0;

		ret = -EROFS;
2844
		if (!TRANS_ABORTED(trans))
2845
			ret = btrfs_discard_extent(fs_info,
2846 2847
						   block_group->start,
						   block_group->length,
2848 2849 2850
						   &trimmed);

		list_del_init(&block_group->bg_list);
2851
		btrfs_unfreeze_block_group(block_group);
2852 2853 2854 2855 2856
		btrfs_put_block_group(block_group);

		if (ret) {
			const char *errstr = btrfs_decode_error(ret);
			btrfs_warn(fs_info,
2857
			   "discard failed while removing blockgroup: errno=%d %s",
2858 2859 2860 2861
				   ret, errstr);
		}
	}

C
Chris Mason 已提交
2862 2863 2864
	return 0;
}

/*
 * Drop one or more refs of @node.
 *
 * 1. Locate the extent refs.
 *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
 *    Locate it, then reduce the refs number or remove the ref line completely.
 *
 * 2. Update the refs count in EXTENT/METADATA_ITEM
 *
 * Inline backref case:
 *
 * in extent tree we have:
 *
 * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
 *		refs 2 gen 6 flags DATA
 *		extent data backref root FS_TREE objectid 258 offset 0 count 1
 *		extent data backref root FS_TREE objectid 257 offset 0 count 1
 *
 * This function gets called with:
 *
 *    node->bytenr = 13631488
 *    node->num_bytes = 1048576
 *    root_objectid = FS_TREE
 *    owner_objectid = 257
 *    owner_offset = 0
 *    refs_to_drop = 1
 *
 * Then we should get something like:
 *
 * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
 *		refs 1 gen 6 flags DATA
 *		extent data backref root FS_TREE objectid 258 offset 0 count 1
 *
 * Keyed backref case:
 *
 * in extent tree we have:
 *
 *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
 *		refs 754 gen 6 flags DATA
 *	[...]
 *	item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
 *		extent data backref root FS_TREE objectid 866 offset 0 count 1
 *
 * This function gets called with:
 *
 *    node->bytenr = 13631488
 *    node->num_bytes = 1048576
 *    root_objectid = FS_TREE
 *    owner_objectid = 866
 *    owner_offset = 0
 *    refs_to_drop = 1
 *
 * Then we should get something like:
 *
 *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
 *		refs 753 gen 6 flags DATA
 *
 * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
 */
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2925 2926 2927 2928
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extent_op)
2929
{
2930
	struct btrfs_fs_info *info = trans->fs_info;
C
Chris Mason 已提交
2931
	struct btrfs_key key;
2932
	struct btrfs_path *path;
2933
	struct btrfs_root *extent_root;
2934
	struct extent_buffer *leaf;
2935 2936
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
2937
	int ret;
2938
	int is_data;
2939 2940 2941
	int extent_slot = 0;
	int found_extent = 0;
	int num_to_del = 1;
2942 2943
	u32 item_size;
	u64 refs;
2944 2945
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
J
Josef Bacik 已提交
2946
	int last_ref = 0;
2947
	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
C
Chris Mason 已提交
2948

2949
	extent_root = btrfs_extent_root(info, bytenr);
2950
	ASSERT(extent_root);
2951

2952
	path = btrfs_alloc_path();
2953 2954
	if (!path)
		return -ENOMEM;
2955

2956
	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
2957 2958 2959 2960 2961 2962 2963 2964 2965

	if (!is_data && refs_to_drop != 1) {
		btrfs_crit(info,
"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
			   node->bytenr, refs_to_drop);
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
2966

2967
	if (is_data)
2968
		skinny_metadata = false;
2969

2970 2971
	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
				    parent, root_objectid, owner_objectid,
2972
				    owner_offset);
2973
	if (ret == 0) {
2974 2975 2976 2977 2978 2979 2980
		/*
		 * Either the inline backref or the SHARED_DATA_REF/
		 * SHARED_BLOCK_REF is found
		 *
		 * Here is a quick path to locate EXTENT/METADATA_ITEM.
		 * It's possible the EXTENT/METADATA_ITEM is near current slot.
		 */
2981
		extent_slot = path->slots[0];
2982 2983
		while (extent_slot >= 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
2984
					      extent_slot);
2985
			if (key.objectid != bytenr)
2986
				break;
2987 2988
			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes) {
2989 2990 2991
				found_extent = 1;
				break;
			}
2992 2993 2994 2995 2996
			if (key.type == BTRFS_METADATA_ITEM_KEY &&
			    key.offset == owner_objectid) {
				found_extent = 1;
				break;
			}
2997 2998

			/* Quick path didn't find the EXTENT/METADATA_ITEM */
2999 3000
			if (path->slots[0] - extent_slot > 5)
				break;
3001
			extent_slot--;
3002
		}
3003

Z
Zheng Yan 已提交
3004
		if (!found_extent) {
3005 3006 3007 3008 3009 3010 3011
			if (iref) {
				btrfs_crit(info,
"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
				btrfs_abort_transaction(trans, -EUCLEAN);
				goto err_dump;
			}
			/* Must be SHARED_* item, remove the backref first */
3012 3013 3014
			ret = remove_extent_backref(trans, extent_root, path,
						    NULL, refs_to_drop, is_data,
						    &last_ref);
3015
			if (ret) {
3016
				btrfs_abort_transaction(trans, ret);
3017 3018
				goto out;
			}
3019
			btrfs_release_path(path);
3020

3021
			/* Slow path to locate EXTENT/METADATA_ITEM */
3022 3023 3024 3025
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;

3026 3027 3028 3029 3030
			if (!is_data && skinny_metadata) {
				key.type = BTRFS_METADATA_ITEM_KEY;
				key.offset = owner_objectid;
			}

Z
Zheng Yan 已提交
3031 3032
			ret = btrfs_search_slot(trans, extent_root,
						&key, path, -1, 1);
3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048
			if (ret > 0 && skinny_metadata && path->slots[0]) {
				/*
				 * Couldn't find our skinny metadata item,
				 * see if we have ye olde extent item.
				 */
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == num_bytes)
					ret = 0;
			}

			if (ret > 0 && skinny_metadata) {
				skinny_metadata = false;
3049
				key.objectid = bytenr;
3050 3051 3052 3053 3054 3055 3056
				key.type = BTRFS_EXTENT_ITEM_KEY;
				key.offset = num_bytes;
				btrfs_release_path(path);
				ret = btrfs_search_slot(trans, extent_root,
							&key, path, -1, 1);
			}

3057
			if (ret) {
J
Jeff Mahoney 已提交
3058 3059 3060
				btrfs_err(info,
					  "umm, got %d back from search, was looking for %llu",
					  ret, bytenr);
3061
				if (ret > 0)
3062
					btrfs_print_leaf(path->nodes[0]);
3063
			}
3064
			if (ret < 0) {
3065
				btrfs_abort_transaction(trans, ret);
3066 3067
				goto out;
			}
Z
Zheng Yan 已提交
3068 3069
			extent_slot = path->slots[0];
		}
3070
	} else if (WARN_ON(ret == -ENOENT)) {
3071
		btrfs_print_leaf(path->nodes[0]);
3072 3073
		btrfs_err(info,
			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
3074 3075
			bytenr, parent, root_objectid, owner_objectid,
			owner_offset);
3076
		btrfs_abort_transaction(trans, ret);
3077
		goto out;
3078
	} else {
3079
		btrfs_abort_transaction(trans, ret);
3080
		goto out;
3081
	}
3082 3083

	leaf = path->nodes[0];
3084
	item_size = btrfs_item_size(leaf, extent_slot);
3085
	if (unlikely(item_size < sizeof(*ei))) {
3086 3087 3088 3089 3090
		ret = -EINVAL;
		btrfs_print_v0_err(info);
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
3091
	ei = btrfs_item_ptr(leaf, extent_slot,
C
Chris Mason 已提交
3092
			    struct btrfs_extent_item);
3093 3094
	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
	    key.type == BTRFS_EXTENT_ITEM_KEY) {
3095
		struct btrfs_tree_block_info *bi;
3096 3097
		if (item_size < sizeof(*ei) + sizeof(*bi)) {
			btrfs_crit(info,
3098
"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
3099 3100 3101 3102 3103 3104
				   key.objectid, key.type, key.offset,
				   owner_objectid, item_size,
				   sizeof(*ei) + sizeof(*bi));
			btrfs_abort_transaction(trans, -EUCLEAN);
			goto err_dump;
		}
3105 3106 3107
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
	}
3108

3109
	refs = btrfs_extent_refs(leaf, ei);
3110
	if (refs < refs_to_drop) {
3111 3112
		btrfs_crit(info,
		"trying to drop %d refs but we only have %llu for bytenr %llu",
J
Jeff Mahoney 已提交
3113
			  refs_to_drop, refs, bytenr);
3114 3115
		btrfs_abort_transaction(trans, -EUCLEAN);
		goto err_dump;
3116
	}
3117
	refs -= refs_to_drop;
3118

3119 3120 3121 3122 3123 3124
	if (refs > 0) {
		if (extent_op)
			__run_delayed_extent_op(extent_op, leaf, ei);
		/*
		 * In the case of inline back ref, reference count will
		 * be updated by remove_extent_backref
3125
		 */
3126
		if (iref) {
3127 3128 3129 3130 3131 3132
			if (!found_extent) {
				btrfs_crit(info,
"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
				btrfs_abort_transaction(trans, -EUCLEAN);
				goto err_dump;
			}
3133 3134 3135 3136 3137
		} else {
			btrfs_set_extent_refs(leaf, ei, refs);
			btrfs_mark_buffer_dirty(leaf);
		}
		if (found_extent) {
3138 3139
			ret = remove_extent_backref(trans, extent_root, path,
						    iref, refs_to_drop, is_data,
3140
						    &last_ref);
3141
			if (ret) {
3142
				btrfs_abort_transaction(trans, ret);
3143 3144
				goto out;
			}
3145
		}
3146
	} else {
3147
		/* In this branch refs == 1 */
3148
		if (found_extent) {
3149 3150 3151 3152 3153 3154 3155 3156 3157
			if (is_data && refs_to_drop !=
			    extent_data_ref_count(path, iref)) {
				btrfs_crit(info,
		"invalid refs_to_drop, current refs %u refs_to_drop %u",
					   extent_data_ref_count(path, iref),
					   refs_to_drop);
				btrfs_abort_transaction(trans, -EUCLEAN);
				goto err_dump;
			}
3158
			if (iref) {
3159 3160 3161 3162 3163 3164 3165 3166
				if (path->slots[0] != extent_slot) {
					btrfs_crit(info,
"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
						   key.objectid, key.type,
						   key.offset);
					btrfs_abort_transaction(trans, -EUCLEAN);
					goto err_dump;
				}
3167
			} else {
3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179
				/*
				 * No inline ref, we must be at SHARED_* item,
				 * And it's single ref, it must be:
				 * |	extent_slot	  ||extent_slot + 1|
				 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
				 */
				if (path->slots[0] != extent_slot + 1) {
					btrfs_crit(info,
	"invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
					btrfs_abort_transaction(trans, -EUCLEAN);
					goto err_dump;
				}
3180 3181 3182
				path->slots[0] = extent_slot;
				num_to_del = 2;
			}
C
Chris Mason 已提交
3183
		}
3184

J
Josef Bacik 已提交
3185
		last_ref = 1;
3186 3187
		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
				      num_to_del);
3188
		if (ret) {
3189
			btrfs_abort_transaction(trans, ret);
3190 3191
			goto out;
		}
3192
		btrfs_release_path(path);
3193

3194
		if (is_data) {
3195 3196 3197
			struct btrfs_root *csum_root;
			csum_root = btrfs_csum_root(info, bytenr);
			ret = btrfs_del_csums(trans, csum_root, bytenr,
3198
					      num_bytes);
3199
			if (ret) {
3200
				btrfs_abort_transaction(trans, ret);
3201 3202
				goto out;
			}
3203 3204
		}

3205
		ret = add_to_free_space_tree(trans, bytenr, num_bytes);
3206
		if (ret) {
3207
			btrfs_abort_transaction(trans, ret);
3208 3209 3210
			goto out;
		}

3211
		ret = btrfs_update_block_group(trans, bytenr, num_bytes, false);
3212
		if (ret) {
3213
			btrfs_abort_transaction(trans, ret);
3214 3215
			goto out;
		}
3216
	}
J
Josef Bacik 已提交
3217 3218
	btrfs_release_path(path);

3219
out:
3220
	btrfs_free_path(path);
3221
	return ret;
3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234
err_dump:
	/*
	 * Leaf dump can take up a lot of log buffer, so we only do full leaf
	 * dump for debug build.
	 */
	if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
		btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
			   path->slots[0], extent_slot);
		btrfs_print_leaf(path->nodes[0]);
	}

	btrfs_free_path(path);
	return -EUCLEAN;
3235 3236
}

3237
/*
3238
 * when we free a block, it is possible (and likely) that we free the last
3239 3240 3241 3242 3243
 * delayed ref for that extent as well.  This searches the delayed ref tree for
 * a given extent, and if there are no other delayed refs to be processed, it
 * removes it from the tree.
 */
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3244
				      u64 bytenr)
3245 3246 3247
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
3248
	int ret = 0;
3249 3250 3251

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
3252
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3253
	if (!head)
3254
		goto out_delayed_unlock;
3255

3256
	spin_lock(&head->lock);
3257
	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3258 3259
		goto out;

J
Josef Bacik 已提交
3260 3261
	if (cleanup_extent_op(head) != NULL)
		goto out;
3262

3263 3264 3265 3266 3267 3268 3269
	/*
	 * waiting for the lock here would deadlock.  If someone else has it
	 * locked they are already in the process of dropping it anyway
	 */
	if (!mutex_trylock(&head->mutex))
		goto out;

3270
	btrfs_delete_ref_head(delayed_refs, head);
3271
	head->processing = 0;
3272

3273
	spin_unlock(&head->lock);
3274 3275
	spin_unlock(&delayed_refs->lock);

3276 3277 3278 3279
	BUG_ON(head->extent_op);
	if (head->must_insert_reserved)
		ret = 1;

3280
	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3281
	mutex_unlock(&head->mutex);
3282
	btrfs_put_delayed_ref_head(head);
3283
	return ret;
3284
out:
3285
	spin_unlock(&head->lock);
3286 3287

out_delayed_unlock:
3288 3289 3290 3291
	spin_unlock(&delayed_refs->lock);
	return 0;
}

3292
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
3293
			   u64 root_id,
3294
			   struct extent_buffer *buf,
3295
			   u64 parent, int last_ref)
3296
{
3297
	struct btrfs_fs_info *fs_info = trans->fs_info;
3298
	struct btrfs_ref generic_ref = { 0 };
3299 3300
	int ret;

3301 3302 3303
	btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
			       buf->start, buf->len, parent);
	btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
3304
			    root_id, 0, false);
3305

3306
	if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3307
		btrfs_ref_tree_mod(fs_info, &generic_ref);
3308
		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
3309
		BUG_ON(ret); /* -ENOMEM */
3310 3311
	}

3312
	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
3313
		struct btrfs_block_group *cache;
3314
		bool must_pin = false;
3315

3316
		if (root_id != BTRFS_TREE_LOG_OBJECTID) {
3317
			ret = check_ref_cleanup(trans, buf->start);
3318 3319
			if (!ret) {
				btrfs_redirty_list_add(trans->transaction, buf);
3320
				goto out;
3321
			}
3322 3323
		}

3324
		cache = btrfs_lookup_block_group(fs_info, buf->start);
3325

3326
		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3327
			pin_down_extent(trans, cache, buf->start, buf->len, 1);
3328
			btrfs_put_block_group(cache);
3329
			goto out;
3330 3331
		}

3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345
		/*
		 * If this is a leaf and there are tree mod log users, we may
		 * have recorded mod log operations that point to this leaf.
		 * So we must make sure no one reuses this leaf's extent before
		 * mod log operations are applied to a node, otherwise after
		 * rewinding a node using the mod log operations we get an
		 * inconsistent btree, as the leaf's extent may now be used as
		 * a node or leaf for another different btree.
		 * We are safe from races here because at this point no other
		 * node or root points to this extent buffer, so if after this
		 * check a new tree mod log user joins, it will not be able to
		 * find a node pointing to this leaf and record operations that
		 * point to this leaf.
		 */
3346 3347 3348
		if (btrfs_header_level(buf) == 0 &&
		    test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
			must_pin = true;
3349 3350

		if (must_pin || btrfs_is_zoned(fs_info)) {
3351 3352 3353 3354 3355 3356
			btrfs_redirty_list_add(trans->transaction, buf);
			pin_down_extent(trans, cache, buf->start, buf->len, 1);
			btrfs_put_block_group(cache);
			goto out;
		}

3357 3358 3359
		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));

		btrfs_add_free_space(cache, buf->start, buf->len);
3360
		btrfs_free_reserved_bytes(cache, buf->len, 0);
3361
		btrfs_put_block_group(cache);
3362
		trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3363 3364
	}
out:
3365 3366 3367 3368 3369 3370 3371
	if (last_ref) {
		/*
		 * Deleting the buffer, clear the corrupt flag since it doesn't
		 * matter anymore.
		 */
		clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
	}
3372 3373
}

3374
/* Can return -ENOMEM */
3375
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3376
{
3377
	struct btrfs_fs_info *fs_info = trans->fs_info;
3378 3379
	int ret;

3380
	if (btrfs_is_testing(fs_info))
3381
		return 0;
3382

3383 3384 3385 3386
	/*
	 * tree log blocks never actually go into the extent allocation
	 * tree, just update pinning info and exit early.
	 */
3387
	if ((ref->type == BTRFS_REF_METADATA &&
3388
	     ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
3389
	    (ref->type == BTRFS_REF_DATA &&
3390
	     ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) {
3391
		/* unlocks the pinned mutex */
3392
		btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
3393
		ret = 0;
3394
	} else if (ref->type == BTRFS_REF_METADATA) {
3395
		ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
3396
	} else {
3397
		ret = btrfs_add_delayed_data_ref(trans, ref, 0);
3398
	}
3399

3400
	if (!((ref->type == BTRFS_REF_METADATA &&
3401
	       ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) ||
3402
	      (ref->type == BTRFS_REF_DATA &&
3403
	       ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)))
3404
		btrfs_ref_tree_mod(fs_info, ref);
3405

3406 3407 3408
	return ret;
}

J
Josef Bacik 已提交
3409
enum btrfs_loop_type {
3410 3411 3412 3413
	LOOP_CACHING_NOWAIT,
	LOOP_CACHING_WAIT,
	LOOP_ALLOC_CHUNK,
	LOOP_NO_EMPTY_SIZE,
J
Josef Bacik 已提交
3414 3415
};

3416
static inline void
3417
btrfs_lock_block_group(struct btrfs_block_group *cache,
3418 3419 3420 3421 3422 3423
		       int delalloc)
{
	if (delalloc)
		down_read(&cache->data_rwsem);
}

3424
static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
3425 3426 3427 3428 3429 3430 3431
		       int delalloc)
{
	btrfs_get_block_group(cache);
	if (delalloc)
		down_read(&cache->data_rwsem);
}

3432 3433
static struct btrfs_block_group *btrfs_lock_cluster(
		   struct btrfs_block_group *block_group,
3434 3435
		   struct btrfs_free_cluster *cluster,
		   int delalloc)
3436
	__acquires(&cluster->refill_lock)
3437
{
3438
	struct btrfs_block_group *used_bg = NULL;
3439

3440
	spin_lock(&cluster->refill_lock);
3441 3442 3443 3444 3445 3446
	while (1) {
		used_bg = cluster->block_group;
		if (!used_bg)
			return NULL;

		if (used_bg == block_group)
3447 3448
			return used_bg;

3449
		btrfs_get_block_group(used_bg);
3450

3451 3452
		if (!delalloc)
			return used_bg;
3453

3454 3455
		if (down_read_trylock(&used_bg->data_rwsem))
			return used_bg;
3456

3457
		spin_unlock(&cluster->refill_lock);
3458

3459 3460
		/* We should only have one-level nested. */
		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
3461

3462 3463 3464
		spin_lock(&cluster->refill_lock);
		if (used_bg == cluster->block_group)
			return used_bg;
3465

3466 3467 3468
		up_read(&used_bg->data_rwsem);
		btrfs_put_block_group(used_bg);
	}
3469 3470 3471
}

static inline void
3472
btrfs_release_block_group(struct btrfs_block_group *cache,
3473 3474 3475 3476 3477 3478 3479
			 int delalloc)
{
	if (delalloc)
		up_read(&cache->data_rwsem);
	btrfs_put_block_group(cache);
}

enum btrfs_extent_allocation_policy {
	BTRFS_EXTENT_ALLOC_CLUSTERED,
	BTRFS_EXTENT_ALLOC_ZONED,
};

/*
 * Structure used internally for find_free_extent() function.  Wraps needed
 * parameters.
 */
struct find_free_extent_ctl {
	/* Basic allocation info */
	u64 ram_bytes;
	u64 num_bytes;
	u64 min_alloc_size;
	u64 empty_size;
	u64 flags;
	int delalloc;

	/* Where to start the search inside the bg */
	u64 search_start;

	/* For clustered allocation */
	u64 empty_cluster;
	struct btrfs_free_cluster *last_ptr;
	bool use_cluster;

	bool have_caching_bg;
	bool orig_have_caching_bg;

	/* Allocation is called for tree-log */
	bool for_treelog;

	/* Allocation is called for data relocation */
	bool for_data_reloc;

	/* RAID index, converted from flags */
	int index;

	/*
	 * Current loop number, check find_free_extent_update_loop() for details
	 */
	int loop;

	/*
	 * Whether we're refilling a cluster, if true we need to re-search
	 * current block group but don't try to refill the cluster again.
	 */
	bool retry_clustered;

	/*
	 * Whether we're updating free space cache, if true we need to re-search
	 * current block group but don't try updating free space cache again.
	 */
	bool retry_unclustered;

	/* If current block group is cached */
	int cached;

	/* Max contiguous hole found */
	u64 max_extent_size;

	/* Total free space from free space cache, not always contiguous */
	u64 total_free_space;

	/* Found result */
	u64 found_offset;

	/* Hint where to start looking for an empty space */
	u64 hint_byte;

	/* Allocation policy */
	enum btrfs_extent_allocation_policy policy;
};

/*
 * Helper function for find_free_extent().
 *
 * Return -ENOENT to inform caller that we need fallback to unclustered mode.
 * Return -EAGAIN to inform caller that we need to re-search this block group
 * Return >0 to inform caller that we find nothing
 * Return 0 means we have found a location and set ffe_ctl->found_offset.
 */
static int find_free_extent_clustered(struct btrfs_block_group *bg,
				      struct find_free_extent_ctl *ffe_ctl,
				      struct btrfs_block_group **cluster_bg_ret)
{
	struct btrfs_block_group *cluster_bg;
	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
	u64 aligned_cluster;
	u64 offset;
	int ret;

	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
	if (!cluster_bg)
		goto refill_cluster;
	if (cluster_bg != bg && (cluster_bg->ro ||
	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
		goto release_cluster;

	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
			ffe_ctl->num_bytes, cluster_bg->start,
			&ffe_ctl->max_extent_size);
	if (offset) {
		/* We have a block, we're done */
		spin_unlock(&last_ptr->refill_lock);
		trace_btrfs_reserve_extent_cluster(cluster_bg,
				ffe_ctl->search_start, ffe_ctl->num_bytes);
		*cluster_bg_ret = cluster_bg;
		ffe_ctl->found_offset = offset;
		return 0;
	}
	WARN_ON(last_ptr->block_group != cluster_bg);

release_cluster:
	/*
	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
	 * let's just skip it and let the allocator find whatever block it can
	 * find. If we reach this point, we will have tried the cluster
	 * allocator plenty of times and not have found anything, so we are
	 * likely way too fragmented for the clustering stuff to find anything.
	 *
	 * However, if the cluster is taken from the current block group,
	 * release the cluster first, so that we stand a better chance of
	 * succeeding in the unclustered allocation.
	 */
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
		spin_unlock(&last_ptr->refill_lock);
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
		return -ENOENT;
	}

	/* This cluster didn't work out, free it and start over */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);

	if (cluster_bg != bg)
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);

refill_cluster:
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
		spin_unlock(&last_ptr->refill_lock);
		return -ENOENT;
	}

	aligned_cluster = max_t(u64,
			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
			bg->full_stripe_len);
	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
			ffe_ctl->num_bytes, aligned_cluster);
	if (ret == 0) {
		/* Now pull our allocation out of this cluster */
		offset = btrfs_alloc_from_cluster(bg, last_ptr,
				ffe_ctl->num_bytes, ffe_ctl->search_start,
				&ffe_ctl->max_extent_size);
		if (offset) {
			/* We found one, proceed */
			spin_unlock(&last_ptr->refill_lock);
			trace_btrfs_reserve_extent_cluster(bg,
					ffe_ctl->search_start,
					ffe_ctl->num_bytes);
			ffe_ctl->found_offset = offset;
			return 0;
		}
	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
		   !ffe_ctl->retry_clustered) {
		spin_unlock(&last_ptr->refill_lock);

		ffe_ctl->retry_clustered = true;
		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
				ffe_ctl->empty_cluster + ffe_ctl->empty_size);
		return -EAGAIN;
	}
	/*
	 * At this point we either didn't find a cluster or we weren't able to
	 * allocate a block from our cluster.  Free the cluster we've been
	 * trying to use, and go to the next block group.
	 */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);
	spin_unlock(&last_ptr->refill_lock);
	return 1;
}

/*
 * Return >0 to inform caller that we find nothing
 * Return 0 when we found a free extent and set ffe_ctl->found_offset
 * Return -EAGAIN to inform caller that we need to re-search this block group
 */
static int find_free_extent_unclustered(struct btrfs_block_group *bg,
					struct find_free_extent_ctl *ffe_ctl)
{
	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
	u64 offset;

	/*
	 * We are doing an unclustered allocation, set the fragmented flag so
	 * we don't bother trying to setup a cluster again until we get more
	 * space.
	 */
	if (unlikely(last_ptr)) {
		spin_lock(&last_ptr->lock);
		last_ptr->fragmented = 1;
		spin_unlock(&last_ptr->lock);
	}
	if (ffe_ctl->cached) {
		struct btrfs_free_space_ctl *free_space_ctl;

		free_space_ctl = bg->free_space_ctl;
		spin_lock(&free_space_ctl->tree_lock);
		if (free_space_ctl->free_space <
		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
		    ffe_ctl->empty_size) {
			ffe_ctl->total_free_space = max_t(u64,
					ffe_ctl->total_free_space,
					free_space_ctl->free_space);
			spin_unlock(&free_space_ctl->tree_lock);
			return 1;
		}
		spin_unlock(&free_space_ctl->tree_lock);
	}

	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
			ffe_ctl->num_bytes, ffe_ctl->empty_size,
			&ffe_ctl->max_extent_size);

	/*
	 * If we didn't find a chunk, and we haven't failed on this block group
	 * before, and this block group is in the middle of caching and we are
	 * ok with waiting, then go ahead and wait for progress to be made, and
	 * set @retry_unclustered to true.
	 *
	 * If @retry_unclustered is true then we've already waited on this
	 * block group once and should move on to the next block group.
	 */
	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
	    ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
						      ffe_ctl->empty_size);
		ffe_ctl->retry_unclustered = true;
		return -EAGAIN;
	} else if (!offset) {
		return 1;
	}
	ffe_ctl->found_offset = offset;
	return 0;
}

static int do_allocation_clustered(struct btrfs_block_group *block_group,
				   struct find_free_extent_ctl *ffe_ctl,
				   struct btrfs_block_group **bg_ret)
{
	int ret;

	/* We want to try and use the cluster allocator, so let's look there */
	if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
		ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
		if (ret >= 0 || ret == -EAGAIN)
			return ret;
		/* ret == -ENOENT case falls through */
	}

	return find_free_extent_unclustered(block_group, ffe_ctl);
}

/*
 * Tree-log block group locking
 * ============================
 *
 * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
 * indicates the starting address of a block group, which is reserved only
 * for tree-log metadata.
 *
 * Lock nesting
 * ============
 *
 * space_info::lock
 *   block_group::lock
 *     fs_info::treelog_bg_lock
 */
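
/*
 * Illustrative sketch of the nesting order documented above (not a real code
 * path): do_allocation_zoned() below takes the same locks in this order, with
 * fs_info::relocation_bg_lock nested innermost.
 *
 *	spin_lock(&space_info->lock);
 *	spin_lock(&block_group->lock);
 *	spin_lock(&fs_info->treelog_bg_lock);
 *	... update fs_info->treelog_bg ...
 *	spin_unlock(&fs_info->treelog_bg_lock);
 *	spin_unlock(&block_group->lock);
 *	spin_unlock(&space_info->lock);
 */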

/*
 * Simple allocator for sequential-only block group. It only allows sequential
 * allocation. No need to play with trees. This function also reserves the
 * bytes as in btrfs_add_reserved_bytes.
 */
static int do_allocation_zoned(struct btrfs_block_group *block_group,
			       struct find_free_extent_ctl *ffe_ctl,
			       struct btrfs_block_group **bg_ret)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_space_info *space_info = block_group->space_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	u64 start = block_group->start;
	u64 num_bytes = ffe_ctl->num_bytes;
	u64 avail;
	u64 bytenr = block_group->start;
	u64 log_bytenr;
	u64 data_reloc_bytenr;
	int ret = 0;
	bool skip = false;

	ASSERT(btrfs_is_zoned(block_group->fs_info));

	/*
	 * Do not allow non-tree-log blocks in the dedicated tree-log block
	 * group, and vice versa.
	 */
	spin_lock(&fs_info->treelog_bg_lock);
	log_bytenr = fs_info->treelog_bg;
	if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
			   (!ffe_ctl->for_treelog && bytenr == log_bytenr)))
		skip = true;
	spin_unlock(&fs_info->treelog_bg_lock);
	if (skip)
		return 1;

	/*
	 * Do not allow non-relocation blocks in the dedicated relocation block
	 * group, and vice versa.
	 */
	spin_lock(&fs_info->relocation_bg_lock);
	data_reloc_bytenr = fs_info->data_reloc_bg;
	if (data_reloc_bytenr &&
	    ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) ||
	     (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr)))
		skip = true;
	spin_unlock(&fs_info->relocation_bg_lock);
	if (skip)
		return 1;

	/* Check RO and no space case before trying to activate it */
	spin_lock(&block_group->lock);
	if (block_group->ro ||
	    block_group->alloc_offset == block_group->zone_capacity) {
		ret = 1;
		/*
		 * May need to clear fs_info->{treelog,data_reloc}_bg.
		 * Return the error after taking the locks.
		 */
	}
	spin_unlock(&block_group->lock);

	if (!ret && !btrfs_zone_activate(block_group)) {
		ret = 1;
		/*
		 * May need to clear fs_info->{treelog,data_reloc}_bg.
		 * Return the error after taking the locks.
		 */
	}

	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	spin_lock(&fs_info->treelog_bg_lock);
	spin_lock(&fs_info->relocation_bg_lock);

	if (ret)
		goto out;

	ASSERT(!ffe_ctl->for_treelog ||
	       block_group->start == fs_info->treelog_bg ||
	       fs_info->treelog_bg == 0);
	ASSERT(!ffe_ctl->for_data_reloc ||
	       block_group->start == fs_info->data_reloc_bg ||
	       fs_info->data_reloc_bg == 0);

	if (block_group->ro) {
		ret = 1;
		goto out;
	}

	/*
	 * Do not allow a currently used block group to become the tree-log
	 * dedicated block group.
	 */
	if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
	    (block_group->used || block_group->reserved)) {
		ret = 1;
		goto out;
	}

	/*
	 * Do not allow a currently used block group to become the data
	 * relocation dedicated block group.
	 */
	if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg &&
	    (block_group->used || block_group->reserved)) {
		ret = 1;
		goto out;
	}

	WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity);
	avail = block_group->zone_capacity - block_group->alloc_offset;
	if (avail < num_bytes) {
		if (ffe_ctl->max_extent_size < avail) {
			/*
			 * With sequential allocator, free space is always
			 * contiguous
			 */
			ffe_ctl->max_extent_size = avail;
			ffe_ctl->total_free_space = avail;
		}
		ret = 1;
		goto out;
	}

	if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
		fs_info->treelog_bg = block_group->start;

	if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg)
		fs_info->data_reloc_bg = block_group->start;

	ffe_ctl->found_offset = start + block_group->alloc_offset;
	block_group->alloc_offset += num_bytes;
	spin_lock(&ctl->tree_lock);
	ctl->free_space -= num_bytes;
	spin_unlock(&ctl->tree_lock);

	/*
	 * We do not check if found_offset is aligned to stripesize. The
	 * address is anyway rewritten when using zone append writing.
	 */

	ffe_ctl->search_start = ffe_ctl->found_offset;

out:
	if (ret && ffe_ctl->for_treelog)
		fs_info->treelog_bg = 0;
	if (ret && ffe_ctl->for_data_reloc)
		fs_info->data_reloc_bg = 0;
	spin_unlock(&fs_info->relocation_bg_lock);
	spin_unlock(&fs_info->treelog_bg_lock);
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);
	return ret;
}

static int do_allocation(struct btrfs_block_group *block_group,
			 struct find_free_extent_ctl *ffe_ctl,
			 struct btrfs_block_group **bg_ret)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
	case BTRFS_EXTENT_ALLOC_ZONED:
		return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
	default:
		BUG();
	}
}

static void release_block_group(struct btrfs_block_group *block_group,
				struct find_free_extent_ctl *ffe_ctl,
				int delalloc)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		ffe_ctl->retry_clustered = false;
		ffe_ctl->retry_unclustered = false;
		break;
	case BTRFS_EXTENT_ALLOC_ZONED:
		/* Nothing to do */
		break;
	default:
		BUG();
	}

	BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
	       ffe_ctl->index);
	btrfs_release_block_group(block_group, delalloc);
}

static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
				   struct btrfs_key *ins)
{
	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;

	if (!ffe_ctl->use_cluster && last_ptr) {
		spin_lock(&last_ptr->lock);
		last_ptr->window_start = ins->objectid;
		spin_unlock(&last_ptr->lock);
	}
}

static void found_extent(struct find_free_extent_ctl *ffe_ctl,
			 struct btrfs_key *ins)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		found_extent_clustered(ffe_ctl, ins);
		break;
	case BTRFS_EXTENT_ALLOC_ZONED:
		/* Nothing to do */
		break;
	default:
		BUG();
	}
}

static bool can_allocate_chunk(struct btrfs_fs_info *fs_info,
			       struct find_free_extent_ctl *ffe_ctl)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		return true;
	case BTRFS_EXTENT_ALLOC_ZONED:
		return true;
	default:
		BUG();
	}
}

static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		/*
		 * If we can't allocate a new chunk we've already looped through
		 * at least once, move on to the NO_EMPTY_SIZE case.
		 */
		ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
		return 0;
	case BTRFS_EXTENT_ALLOC_ZONED:
		/* Give up here */
		return -ENOSPC;
	default:
		BUG();
	}
}

/*
 * Return >0 means caller needs to re-search for free extent
 * Return 0 means we have the needed free extent.
 * Return <0 means we failed to locate any free extent.
 */
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
					struct btrfs_key *ins,
					struct find_free_extent_ctl *ffe_ctl,
					bool full_search)
{
	struct btrfs_root *root = fs_info->chunk_root;
	int ret;

	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
		ffe_ctl->orig_have_caching_bg = true;

	if (ins->objectid) {
		found_extent(ffe_ctl, ins);
		return 0;
	}

	if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size &&
	    !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->index)) {
		/*
		 * If we have enough free space left in an already active block
		 * group and we can't activate any other zone now, retry the
		 * active ones with a smaller allocation size.  Returning early
		 * from here will tell btrfs_reserve_extent() to halve the
		 * size.
		 */
		return -ENOSPC;
	}

	if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg)
		return 1;

	ffe_ctl->index++;
	if (ffe_ctl->index < BTRFS_NR_RAID_TYPES)
		return 1;

	/*
	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
	 *			caching kthreads as we move along
	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
	 *		       again
	 */
	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
		ffe_ctl->index = 0;
		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
			/*
			 * We want to skip the LOOP_CACHING_WAIT step if we
			 * don't have any uncached bgs and we've already done a
			 * full search through.
			 */
			if (ffe_ctl->orig_have_caching_bg || !full_search)
				ffe_ctl->loop = LOOP_CACHING_WAIT;
			else
				ffe_ctl->loop = LOOP_ALLOC_CHUNK;
		} else {
			ffe_ctl->loop++;
		}

		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
			struct btrfs_trans_handle *trans;
			int exist = 0;

			/* Check if the allocation policy allows creating a new chunk */
			if (!can_allocate_chunk(fs_info, ffe_ctl))
				return -ENOSPC;

			trans = current->journal_info;
			if (trans)
				exist = 1;
			else
				trans = btrfs_join_transaction(root);

			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				return ret;
			}

			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
						CHUNK_ALLOC_FORCE);

			/* Do not bail out on ENOSPC since we can do more. */
			if (ret == -ENOSPC)
				ret = chunk_allocation_failed(ffe_ctl);
			else if (ret < 0)
				btrfs_abort_transaction(trans, ret);
			else
				ret = 0;
			if (!exist)
				btrfs_end_transaction(trans);
			if (ret)
				return ret;
		}

		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
			if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
				return -ENOSPC;

			/*
			 * Don't loop again if we already have no empty_size and
			 * no empty_cluster.
			 */
			if (ffe_ctl->empty_size == 0 &&
			    ffe_ctl->empty_cluster == 0)
				return -ENOSPC;
			ffe_ctl->empty_size = 0;
			ffe_ctl->empty_cluster = 0;
		}
		return 1;
	}
	return -ENOSPC;
}

static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
					struct find_free_extent_ctl *ffe_ctl,
					struct btrfs_space_info *space_info,
					struct btrfs_key *ins)
{
	/*
	 * If our free space is heavily fragmented we may not be able to make
	 * big contiguous allocations, so instead of doing the expensive search
	 * for free space, simply return ENOSPC with our max_extent_size so we
	 * can go ahead and search for a more manageable chunk.
	 *
	 * If our max_extent_size is large enough for our allocation simply
	 * disable clustering since we will likely not be able to find enough
	 * space to create a cluster and induce latency trying.
	 */
	if (space_info->max_extent_size) {
		spin_lock(&space_info->lock);
		if (space_info->max_extent_size &&
		    ffe_ctl->num_bytes > space_info->max_extent_size) {
			ins->offset = space_info->max_extent_size;
			spin_unlock(&space_info->lock);
			return -ENOSPC;
		} else if (space_info->max_extent_size) {
			ffe_ctl->use_cluster = false;
		}
		spin_unlock(&space_info->lock);
	}

	ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
					       &ffe_ctl->empty_cluster);
	if (ffe_ctl->last_ptr) {
		struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;

		spin_lock(&last_ptr->lock);
		if (last_ptr->block_group)
			ffe_ctl->hint_byte = last_ptr->window_start;
		if (last_ptr->fragmented) {
			/*
			 * We still set window_start so we can keep track of the
			 * last place we found an allocation to try and save
			 * some time.
			 */
			ffe_ctl->hint_byte = last_ptr->window_start;
			ffe_ctl->use_cluster = false;
		}
		spin_unlock(&last_ptr->lock);
	}

	return 0;
}

static int prepare_allocation(struct btrfs_fs_info *fs_info,
			      struct find_free_extent_ctl *ffe_ctl,
			      struct btrfs_space_info *space_info,
			      struct btrfs_key *ins)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		return prepare_allocation_clustered(fs_info, ffe_ctl,
						    space_info, ins);
	case BTRFS_EXTENT_ALLOC_ZONED:
		if (ffe_ctl->for_treelog) {
			spin_lock(&fs_info->treelog_bg_lock);
			if (fs_info->treelog_bg)
				ffe_ctl->hint_byte = fs_info->treelog_bg;
			spin_unlock(&fs_info->treelog_bg_lock);
		}
		if (ffe_ctl->for_data_reloc) {
			spin_lock(&fs_info->relocation_bg_lock);
			if (fs_info->data_reloc_bg)
				ffe_ctl->hint_byte = fs_info->data_reloc_bg;
			spin_unlock(&fs_info->relocation_bg_lock);
		}
		return 0;
	default:
		BUG();
	}
}

/*
 * walks the btree of allocated extents and finds a hole of a given size.
 * The key ins is changed to record the hole:
 * ins->objectid == start position
 * ins->flags = BTRFS_EXTENT_ITEM_KEY
 * ins->offset == the size of the hole.
 * Any available blocks before search_start are skipped.
 *
 * If there is no suitable free space, we will record the max size of
 * the free space extent currently.
 *
 * The overall logic and call chain:
 *
 * find_free_extent()
 * |- Iterate through all block groups
 * |  |- Get a valid block group
 * |  |- Try to do clustered allocation in that block group
 * |  |- Try to do unclustered allocation in that block group
 * |  |- Check if the result is valid
 * |  |  |- If valid, then exit
 * |  |- Jump to next block group
 * |
 * |- Push harder to find free extents
 *    |- If not found, re-iterate all block groups
 */
static noinline int find_free_extent(struct btrfs_root *root,
				     struct btrfs_key *ins,
				     struct find_free_extent_ctl *ffe_ctl)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int cache_block_group_error = 0;
	struct btrfs_block_group *block_group = NULL;
	struct btrfs_space_info *space_info;
	bool full_search = false;

	WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize);

	ffe_ctl->search_start = 0;
	/* For clustered allocation */
	ffe_ctl->empty_cluster = 0;
	ffe_ctl->last_ptr = NULL;
	ffe_ctl->use_cluster = true;
	ffe_ctl->have_caching_bg = false;
	ffe_ctl->orig_have_caching_bg = false;
	ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags);
	ffe_ctl->loop = 0;
	/* For clustered allocation */
	ffe_ctl->retry_clustered = false;
	ffe_ctl->retry_unclustered = false;
	ffe_ctl->cached = 0;
	ffe_ctl->max_extent_size = 0;
	ffe_ctl->total_free_space = 0;
	ffe_ctl->found_offset = 0;
	ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED;

	if (btrfs_is_zoned(fs_info))
		ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED;

	ins->type = BTRFS_EXTENT_ITEM_KEY;
	ins->objectid = 0;
	ins->offset = 0;

	trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size,
			       ffe_ctl->flags);

	space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags);
	if (!space_info) {
		btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags);
		return -ENOSPC;
	}

	ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins);
	if (ret < 0)
		return ret;

	ffe_ctl->search_start = max(ffe_ctl->search_start,
				    first_logical_byte(fs_info, 0));
	ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
	if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
		block_group = btrfs_lookup_block_group(fs_info,
						       ffe_ctl->search_start);
		/*
		 * we don't want to use the block group if it doesn't match our
		 * allocation bits, or if it's not cached.
		 *
		 * However if we are re-searching with an ideal block group
		 * picked out then we don't care that the block group is cached.
		 */
		if (block_group && block_group_bits(block_group, ffe_ctl->flags) &&
		    block_group->cached != BTRFS_CACHE_NO) {
			down_read(&space_info->groups_sem);
			if (list_empty(&block_group->list) ||
			    block_group->ro) {
				/*
				 * someone is removing this block group,
				 * we can't jump into the have_block_group
				 * target because our list pointers are not
				 * valid
				 */
				btrfs_put_block_group(block_group);
				up_read(&space_info->groups_sem);
			} else {
				ffe_ctl->index = btrfs_bg_flags_to_raid_index(
							block_group->flags);
				btrfs_lock_block_group(block_group,
						       ffe_ctl->delalloc);
				goto have_block_group;
			}
		} else if (block_group) {
			btrfs_put_block_group(block_group);
		}
	}
search:
	ffe_ctl->have_caching_bg = false;
	if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) ||
	    ffe_ctl->index == 0)
		full_search = true;
	down_read(&space_info->groups_sem);
	list_for_each_entry(block_group,
			    &space_info->block_groups[ffe_ctl->index], list) {
		struct btrfs_block_group *bg_ret;

		/* If the block group is read-only, we can skip it entirely. */
		if (unlikely(block_group->ro)) {
			if (ffe_ctl->for_treelog)
				btrfs_clear_treelog_bg(block_group);
			if (ffe_ctl->for_data_reloc)
				btrfs_clear_data_reloc_bg(block_group);
			continue;
		}

		btrfs_grab_block_group(block_group, ffe_ctl->delalloc);
		ffe_ctl->search_start = block_group->start;

		/*
		 * this can happen if we end up cycling through all the
		 * raid types, but we want to make sure we only allocate
		 * for the proper type.
		 */
		if (!block_group_bits(block_group, ffe_ctl->flags)) {
			u64 extra = BTRFS_BLOCK_GROUP_DUP |
				BTRFS_BLOCK_GROUP_RAID1_MASK |
				BTRFS_BLOCK_GROUP_RAID56_MASK |
				BTRFS_BLOCK_GROUP_RAID10;

			/*
			 * if they asked for extra copies and this block group
			 * doesn't provide them, bail.  This does allow us to
			 * fill raid0 from raid1.
			 */
			if ((ffe_ctl->flags & extra) && !(block_group->flags & extra))
				goto loop;

			/*
			 * This block group has different flags than we want.
			 * It's possible that we have MIXED_GROUP flag but no
			 * block group is mixed.  Just skip such block group.
			 */
			btrfs_release_block_group(block_group, ffe_ctl->delalloc);
			continue;
		}

have_block_group:
		ffe_ctl->cached = btrfs_block_group_done(block_group);
		if (unlikely(!ffe_ctl->cached)) {
			ffe_ctl->have_caching_bg = true;
			ret = btrfs_cache_block_group(block_group, 0);

			/*
			 * If we get ENOMEM here or something else we want to
			 * try other block groups, because it may not be fatal.
			 * However if we can't find anything else we need to
			 * save our return here so that we return the actual
			 * error that caused problems, not ENOSPC.
			 */
			if (ret < 0) {
				if (!cache_block_group_error)
					cache_block_group_error = ret;
				ret = 0;
				goto loop;
			}
			ret = 0;
		}

		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
			goto loop;

		bg_ret = NULL;
		ret = do_allocation(block_group, ffe_ctl, &bg_ret);
		if (ret == 0) {
			if (bg_ret && bg_ret != block_group) {
				btrfs_release_block_group(block_group,
							  ffe_ctl->delalloc);
				block_group = bg_ret;
			}
		} else if (ret == -EAGAIN) {
			goto have_block_group;
		} else if (ret > 0) {
			goto loop;
		}

		/* Checks */
		ffe_ctl->search_start = round_up(ffe_ctl->found_offset,
						 fs_info->stripesize);

		/* move on to the next group */
		if (ffe_ctl->search_start + ffe_ctl->num_bytes >
		    block_group->start + block_group->length) {
			btrfs_add_free_space_unused(block_group,
					    ffe_ctl->found_offset,
					    ffe_ctl->num_bytes);
			goto loop;
		}

		if (ffe_ctl->found_offset < ffe_ctl->search_start)
			btrfs_add_free_space_unused(block_group,
					ffe_ctl->found_offset,
					ffe_ctl->search_start - ffe_ctl->found_offset);

		ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes,
					       ffe_ctl->num_bytes,
					       ffe_ctl->delalloc);
		if (ret == -EAGAIN) {
			btrfs_add_free_space_unused(block_group,
					ffe_ctl->found_offset,
					ffe_ctl->num_bytes);
			goto loop;
		}
		btrfs_inc_block_group_reservations(block_group);

		/* we are all good, let's return */
		ins->objectid = ffe_ctl->search_start;
		ins->offset = ffe_ctl->num_bytes;

		trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start,
					   ffe_ctl->num_bytes);
		btrfs_release_block_group(block_group, ffe_ctl->delalloc);
		break;
loop:
		release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc);
		cond_resched();
	}
	up_read(&space_info->groups_sem);

	ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search);
	if (ret > 0)
		goto search;

	if (ret == -ENOSPC && !cache_block_group_error) {
		/*
		 * Use ffe_ctl->total_free_space as fallback if we can't find
		 * any contiguous hole.
		 */
		if (!ffe_ctl->max_extent_size)
			ffe_ctl->max_extent_size = ffe_ctl->total_free_space;
		spin_lock(&space_info->lock);
		space_info->max_extent_size = ffe_ctl->max_extent_size;
		spin_unlock(&space_info->lock);
		ins->offset = ffe_ctl->max_extent_size;
	} else if (ret == -ENOSPC) {
		ret = cache_block_group_error;
	}
	return ret;
}

/*
 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
 *			  hole that is at least as big as @num_bytes.
 *
 * @root           -	The root that will contain this extent
 *
 * @ram_bytes      -	The amount of space in ram that @num_bytes take. This
 *			is used for accounting purposes. This value differs
 *			from @num_bytes only in the case of compressed extents.
 *
 * @num_bytes      -	Number of bytes to allocate on-disk.
 *
 * @min_alloc_size -	Indicates the minimum amount of space that the
 *			allocator should try to satisfy. In some cases
 *			@num_bytes may be larger than what is required and if
 *			the filesystem is fragmented then allocation fails.
 *			However, the presence of @min_alloc_size gives a
 *			chance to try and satisfy the smaller allocation.
 *
 * @empty_size     -	A hint that you plan on doing more COW. This is the
 *			size in bytes the allocator should try to find free
 *			next to the block it returns.  This is just a hint and
 *			may be ignored by the allocator.
 *
 * @hint_byte      -	Hint to the allocator to start searching above the byte
 *			address passed. It might be ignored.
 *
 * @ins            -	This key is modified to record the found hole. It will
 *			have the following values:
 *			ins->objectid == start position
 *			ins->flags = BTRFS_EXTENT_ITEM_KEY
 *			ins->offset == the size of the hole.
 *
 * @is_data        -	Boolean flag indicating whether an extent is
 *			allocated for data (true) or metadata (false)
 *
 * @delalloc       -	Boolean flag indicating whether this allocation is for
 *			delalloc or not. If 'true' data_rwsem of block groups
 *			is going to be acquired.
 *
 *
 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
 * case -ENOSPC is returned then @ins->offset will contain the size of the
 * largest available hole the allocator managed to find.
 */
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
			 u64 num_bytes, u64 min_alloc_size,
			 u64 empty_size, u64 hint_byte,
			 struct btrfs_key *ins, int is_data, int delalloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct find_free_extent_ctl ffe_ctl = {};
	bool final_tried = num_bytes == min_alloc_size;
	u64 flags;
	int ret;
	bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
	bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data);

	flags = get_alloc_profile_by_root(root, is_data);
again:
	WARN_ON(num_bytes < fs_info->sectorsize);

	ffe_ctl.ram_bytes = ram_bytes;
	ffe_ctl.num_bytes = num_bytes;
	ffe_ctl.min_alloc_size = min_alloc_size;
	ffe_ctl.empty_size = empty_size;
	ffe_ctl.flags = flags;
	ffe_ctl.delalloc = delalloc;
	ffe_ctl.hint_byte = hint_byte;
	ffe_ctl.for_treelog = for_treelog;
	ffe_ctl.for_data_reloc = for_data_reloc;

	ret = find_free_extent(root, ins, &ffe_ctl);
	if (!ret && !is_data) {
		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
	} else if (ret == -ENOSPC) {
		if (!final_tried && ins->offset) {
			num_bytes = min(num_bytes >> 1, ins->offset);
			num_bytes = round_down(num_bytes,
					       fs_info->sectorsize);
			num_bytes = max(num_bytes, min_alloc_size);
			ram_bytes = num_bytes;
			if (num_bytes == min_alloc_size)
				final_tried = true;
			goto again;
		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
			struct btrfs_space_info *sinfo;

			sinfo = btrfs_find_space_info(fs_info, flags);
			btrfs_err(fs_info,
	"allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d",
				  flags, num_bytes, for_treelog, for_data_reloc);
			if (sinfo)
				btrfs_dump_space_info(fs_info, sinfo,
						      num_bytes, 1);
		}
	}

	return ret;
}
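
/*
 * Typical use of btrfs_reserve_extent() (sketch only, the values are
 * placeholders): ask for @len bytes of data space, allowing the allocator to
 * fall back to smaller allocations down to the sectorsize.
 *
 *	struct btrfs_key ins = { 0 };
 *	int ret;
 *
 *	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
 *				   0, 0, &ins, 1, 1);
 *	if (ret == -ENOSPC)
 *		... ins.offset now holds the largest hole found ...
 */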

int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
			       u64 start, u64 len, int delalloc)
{
	struct btrfs_block_group *cache;

	cache = btrfs_lookup_block_group(fs_info, start);
	if (!cache) {
		btrfs_err(fs_info, "Unable to find block group for %llu",
			  start);
		return -ENOSPC;
	}

	btrfs_add_free_space(cache, start, len);
	btrfs_free_reserved_bytes(cache, len, delalloc);
	trace_btrfs_reserved_extent_free(fs_info, start, len);

	btrfs_put_block_group(cache);
	return 0;
}

int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
			      u64 len)
{
	struct btrfs_block_group *cache;
	int ret = 0;

	cache = btrfs_lookup_block_group(trans->fs_info, start);
	if (!cache) {
		btrfs_err(trans->fs_info, "unable to find block group for %llu",
			  start);
		return -ENOSPC;
	}

	ret = pin_down_extent(trans, cache, start, len, 1);
	btrfs_put_block_group(cache);
	return ret;
}

static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *extent_root;
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int type;
	u32 size;

	if (parent > 0)
		type = BTRFS_SHARED_DATA_REF_KEY;
	else
		type = BTRFS_EXTENT_DATA_REF_KEY;

	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	extent_root = btrfs_extent_root(fs_info, ins->objectid);
	ret = btrfs_insert_empty_item(trans, extent_root, path, ins, size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_DATA);

	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (parent > 0) {
		struct btrfs_shared_data_ref *ref;
		ref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
	} else {
		struct btrfs_extent_data_ref *ref;
		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
	}

	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_free_path(path);

	ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
	if (ret)
		return ret;

	ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true);
	if (ret) { /* -ENOENT, logic error */
		btrfs_err(fs_info, "update block group failed for %llu %llu",
			ins->objectid, ins->offset);
		BUG();
	}
	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
	return ret;
}
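
/*
 * For reference (sketch derived from the function above): the item inserted
 * by alloc_reserved_file_extent() is keyed
 * (ins->objectid, BTRFS_EXTENT_ITEM_KEY, ins->offset) and laid out as:
 *
 *	struct btrfs_extent_item	refs, generation, FLAG_DATA
 *	struct btrfs_extent_inline_ref	type = SHARED_DATA_REF (parent != 0)
 *					    or EXTENT_DATA_REF  (parent == 0)
 *	inline ref body			btrfs_shared_data_ref { count }, or
 *					btrfs_extent_data_ref { root, objectid,
 *					offset, count }
 */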

static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *extent_root;
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_key extent_key;
	struct btrfs_tree_block_info *block_info;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_delayed_tree_ref *ref;
	u32 size = sizeof(*extent_item) + sizeof(*iref);
	u64 num_bytes;
	u64 flags = extent_op->flags_to_set;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

	ref = btrfs_delayed_node_to_tree_ref(node);

	extent_key.objectid = node->bytenr;
	if (skinny_metadata) {
		extent_key.offset = ref->level;
		extent_key.type = BTRFS_METADATA_ITEM_KEY;
		num_bytes = fs_info->nodesize;
	} else {
		extent_key.offset = node->num_bytes;
		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
		size += sizeof(*block_info);
		num_bytes = node->num_bytes;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	extent_root = btrfs_extent_root(fs_info, extent_key.objectid);
	ret = btrfs_insert_empty_item(trans, extent_root, path, &extent_key,
				      size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, 1);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);

	if (skinny_metadata) {
		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	} else {
		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
		btrfs_set_tree_block_level(leaf, block_info, ref->level);
		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
	}

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_SHARED_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
	} else {
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_TREE_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
	}

	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	ret = remove_from_free_space_tree(trans, extent_key.objectid,
					  num_bytes);
	if (ret)
		return ret;

	ret = btrfs_update_block_group(trans, extent_key.objectid,
				       fs_info->nodesize, true);
	if (ret) { /* -ENOENT, logic error */
		btrfs_err(fs_info, "update block group failed for %llu %llu",
			extent_key.objectid, extent_key.offset);
		BUG();
	}

	trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
					  fs_info->nodesize);
	return ret;
}

int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root, u64 owner,
				     u64 offset, u64 ram_bytes,
				     struct btrfs_key *ins)
{
	struct btrfs_ref generic_ref = { 0 };

	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);

	btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
			       ins->objectid, ins->offset, 0);
	btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner,
			    offset, 0, false);
	btrfs_ref_tree_mod(root->fs_info, &generic_ref);

	return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
}

/*
 * this is used by the tree logging recovery code.  It records that
 * an extent has been allocated and makes sure to clear the free
 * space cache bits as well
 */
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
				   u64 root_objectid, u64 owner, u64 offset,
				   struct btrfs_key *ins)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;

	/*
	 * Mixed block groups will exclude before processing the log so we only
	 * need to do the exclude dance if this fs isn't mixed.
	 */
	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = __exclude_logged_extent(fs_info, ins->objectid,
					      ins->offset);
		if (ret)
			return ret;
	}

	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
	if (!block_group)
		return -EINVAL;

	space_info = block_group->space_info;
	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	space_info->bytes_reserved += ins->offset;
	block_group->reserved += ins->offset;
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);

	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
					 offset, ins, 1);
	if (ret)
		btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
	btrfs_put_block_group(block_group);
	return ret;
}

static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      u64 bytenr, int level, u64 owner,
		      enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *buf;

	buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
	if (IS_ERR(buf))
		return buf;

	/*
	 * Extra safety check in case the extent tree is corrupted and extent
	 * allocator chooses to use a tree block which is already used and
	 * locked.
	 */
	if (buf->lock_owner == current->pid) {
		btrfs_err_rl(fs_info,
"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
			buf->start, btrfs_header_owner(buf), current->pid);
		free_extent_buffer(buf);
		return ERR_PTR(-EUCLEAN);
	}

	/*
	 * This needs to stay, because we could allocate a freed block from an
	 * old tree into a new tree, so we need to make sure this new block is
	 * set to the appropriate level and owner.
	 */
	btrfs_set_buffer_lockdep_class(owner, buf, level);
	__btrfs_tree_lock(buf, nest);
	btrfs_clean_tree_block(buf);
	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
	clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);

	set_extent_buffer_uptodate(buf);

	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(buf, level);
	btrfs_set_header_bytenr(buf, buf->start);
	btrfs_set_header_generation(buf, trans->transid);
	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(buf, owner);
	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
		buf->log_index = root->log_transid % 2;
		/*
		 * we allow two log transactions at a time, use different
		 * EXTENT bit to differentiate dirty pages.
		 */
		if (buf->log_index == 0)
			set_extent_dirty(&root->dirty_log_pages, buf->start,
					buf->start + buf->len - 1, GFP_NOFS);
		else
			set_extent_new(&root->dirty_log_pages, buf->start,
					buf->start + buf->len - 1);
	} else {
		buf->log_index = -1;
		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
			 buf->start + buf->len - 1, GFP_NOFS);
	}
	/* this returns a buffer locked for blocking */
	return buf;
}

/*
 * finds a free extent and does all the dirty work required for allocation
 * returns the tree buffer or an ERR_PTR on error.
 */
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     u64 parent, u64 root_objectid,
					     const struct btrfs_disk_key *key,
					     int level, u64 hint,
					     u64 empty_size,
					     enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key ins;
	struct btrfs_block_rsv *block_rsv;
	struct extent_buffer *buf;
	struct btrfs_delayed_extent_op *extent_op;
	struct btrfs_ref generic_ref = { 0 };
	u64 flags = 0;
	int ret;
	u32 blocksize = fs_info->nodesize;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
	if (btrfs_is_testing(fs_info)) {
		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
					    level, root_objectid, nest);
		if (!IS_ERR(buf))
			root->alloc_bytenr += blocksize;
		return buf;
	}
#endif

	block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
	if (IS_ERR(block_rsv))
		return ERR_CAST(block_rsv);

	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
				   empty_size, hint, &ins, 0, 0);
	if (ret)
		goto out_unuse;

	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
				    root_objectid, nest);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_free_reserved;
	}

	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent == 0)
			parent = ins.objectid;
		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
	} else
		BUG_ON(parent > 0);

	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
		extent_op = btrfs_alloc_delayed_extent_op();
		if (!extent_op) {
			ret = -ENOMEM;
			goto out_free_buf;
		}
		if (key)
			memcpy(&extent_op->key, key, sizeof(extent_op->key));
		else
			memset(&extent_op->key, 0, sizeof(extent_op->key));
		extent_op->flags_to_set = flags;
		extent_op->update_key = skinny_metadata ? false : true;
		extent_op->update_flags = true;
		extent_op->is_data = false;
		extent_op->level = level;

		btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
				       ins.objectid, ins.offset, parent);
		btrfs_init_tree_ref(&generic_ref, level, root_objectid,
				    root->root_key.objectid, false);
		btrfs_ref_tree_mod(fs_info, &generic_ref);
		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
		if (ret)
			goto out_free_delayed;
	}
	return buf;

out_free_delayed:
	btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
	btrfs_tree_unlock(buf);
	free_extent_buffer(buf);
out_free_reserved:
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
out_unuse:
	btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
	return ERR_PTR(ret);
}
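
/*
 * Sketch of a typical caller (illustrative only; the COW paths in ctree.c are
 * the real users).  The disk_key, level and search_start values here are
 * placeholders:
 *
 *	cow = btrfs_alloc_tree_block(trans, root, parent_start,
 *				     root->root_key.objectid, &disk_key, level,
 *				     search_start, empty_size,
 *				     BTRFS_NESTING_COW);
 *	if (IS_ERR(cow))
 *		return PTR_ERR(cow);
 */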

struct walk_control {
	u64 refs[BTRFS_MAX_LEVEL];
	u64 flags[BTRFS_MAX_LEVEL];
	struct btrfs_key update_progress;
	struct btrfs_key drop_progress;
	int drop_level;
	int stage;
	int level;
	int shared_level;
	int update_ref;
	int keep_locks;
	int reada_slot;
	int reada_count;
	int restarted;
};

#define DROP_REFERENCE	1
#define UPDATE_BACKREF	2

static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct walk_control *wc,
				     struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 refs;
	u64 flags;
	u32 nritems;
	struct btrfs_key key;
	struct extent_buffer *eb;
	int ret;
	int slot;
	int nread = 0;

	if (path->slots[wc->level] < wc->reada_slot) {
		wc->reada_count = wc->reada_count * 2 / 3;
		wc->reada_count = max(wc->reada_count, 2);
	} else {
		wc->reada_count = wc->reada_count * 3 / 2;
		wc->reada_count = min_t(int, wc->reada_count,
					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
	}

	eb = path->nodes[wc->level];
	nritems = btrfs_header_nritems(eb);

	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
		if (nread >= wc->reada_count)
			break;

		cond_resched();
		bytenr = btrfs_node_blockptr(eb, slot);
		generation = btrfs_node_ptr_generation(eb, slot);

		if (slot == path->slots[wc->level])
			goto reada;

		if (wc->stage == UPDATE_BACKREF &&
		    generation <= root->root_key.offset)
			continue;

		/* We don't lock the tree block, it's OK to be racy here */
		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
					       wc->level - 1, 1, &refs,
					       &flags);
		/* We don't care about errors in readahead. */
		if (ret < 0)
			continue;
		BUG_ON(refs == 0);

		if (wc->stage == DROP_REFERENCE) {
			if (refs == 1)
				goto reada;

			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				continue;
			btrfs_node_key_to_cpu(eb, &key, slot);
			ret = btrfs_comp_cpu_keys(&key,
						  &wc->update_progress);
			if (ret < 0)
				continue;
		} else {
			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
		}
reada:
		btrfs_readahead_node_child(eb, slot);
		nread++;
	}
	wc->reada_slot = slot;
}

/*
 * helper to process tree block while walking down the tree.
 *
 * when wc->stage == UPDATE_BACKREF, this function updates
 * back refs for pointers in the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc, int lookup_info)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	int ret;

	if (wc->stage == UPDATE_BACKREF &&
	    btrfs_header_owner(eb) != root->root_key.objectid)
		return 1;

	/*
	 * when reference count of tree block is 1, it won't increase
	 * again. once full backref flag is set, we never clear it.
	 */
	if (lookup_info &&
	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
		BUG_ON(!path->locks[level]);
		ret = btrfs_lookup_extent_info(trans, fs_info,
					       eb->start, level, 1,
					       &wc->refs[level],
					       &wc->flags[level]);
		BUG_ON(ret == -ENOMEM);
		if (ret)
			return ret;
		BUG_ON(wc->refs[level] == 0);
	}

	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level] > 1)
			return 1;

		if (path->locks[level] && !wc->keep_locks) {
			btrfs_tree_unlock_rw(eb, path->locks[level]);
			path->locks[level] = 0;
		}
		return 0;
	}

	/* wc->stage == UPDATE_BACKREF */
	if (!(wc->flags[level] & flag)) {
		BUG_ON(!path->locks[level]);
		ret = btrfs_inc_ref(trans, root, eb, 1);
		BUG_ON(ret); /* -ENOMEM */
		ret = btrfs_dec_ref(trans, root, eb, 0);
		BUG_ON(ret); /* -ENOMEM */
		ret = btrfs_set_disk_extent_flags(trans, eb, flag,
						  btrfs_header_level(eb), 0);
		BUG_ON(ret); /* -ENOMEM */
		wc->flags[level] |= flag;
	}

	/*
	 * the block is shared by multiple trees, so it's not good to
	 * keep the tree lock
	 */
	if (path->locks[level] && level > 0) {
		btrfs_tree_unlock_rw(eb, path->locks[level]);
		path->locks[level] = 0;
	}
	return 0;
}

/*
 * This is used to verify a ref exists for this root to deal with a bug where we
 * would have a drop_progress key that hadn't been updated properly.
 */
static int check_ref_exists(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, u64 bytenr, u64 parent,
			    int level)
{
	struct btrfs_path *path;
	struct btrfs_extent_inline_ref *iref;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = lookup_extent_backref(trans, path, &iref, bytenr,
				    root->fs_info->nodesize, parent,
				    root->root_key.objectid, level, 0);
	btrfs_free_path(path);
	if (ret == -ENOENT)
		return 0;
	if (ret < 0)
		return ret;
	return 1;
}

/*
 * helper to process tree block pointer.
 *
 * when wc->stage == DROP_REFERENCE, this function checks
 * reference count of the block pointed to. if the block
 * is shared and we need to update back refs for the subtree
 * rooted at the block, this function changes wc->stage to
 * UPDATE_BACKREF. if the block is shared and there is no
 * need to update back refs, this function drops the reference
 * to the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int *lookup_info)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 parent;
	struct btrfs_key key;
	struct btrfs_key first_key;
	struct btrfs_ref ref = { 0 };
	struct extent_buffer *next;
	int level = wc->level;
	int reada = 0;
	int ret = 0;
	bool need_account = false;

	generation = btrfs_node_ptr_generation(path->nodes[level],
					       path->slots[level]);
	/*
	 * if the lower level block was created before the snapshot
	 * was created, we know there is no need to update back refs
	 * for the subtree
	 */
	if (wc->stage == UPDATE_BACKREF &&
	    generation <= root->root_key.offset) {
		*lookup_info = 1;
		return 1;
	}

	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
	btrfs_node_key_to_cpu(path->nodes[level], &first_key,
			      path->slots[level]);

	next = find_extent_buffer(fs_info, bytenr);
	if (!next) {
		next = btrfs_find_create_tree_block(fs_info, bytenr,
				root->root_key.objectid, level - 1);
		if (IS_ERR(next))
			return PTR_ERR(next);
		reada = 1;
	}
	btrfs_tree_lock(next);

	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
				       &wc->refs[level - 1],
				       &wc->flags[level - 1]);
	if (ret < 0)
		goto out_unlock;

	if (unlikely(wc->refs[level - 1] == 0)) {
		btrfs_err(fs_info, "Missing references.");
		ret = -EIO;
		goto out_unlock;
	}
	*lookup_info = 0;

	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level - 1] > 1) {
			need_account = true;
			if (level == 1 &&
			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				goto skip;

			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				goto skip;

			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
			if (ret < 0)
				goto skip;

			wc->stage = UPDATE_BACKREF;
			wc->shared_level = level - 1;
		}
	} else {
		if (level == 1 &&
		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
			goto skip;
	}

	if (!btrfs_buffer_uptodate(next, generation, 0)) {
		btrfs_tree_unlock(next);
		free_extent_buffer(next);
		next = NULL;
		*lookup_info = 1;
	}

	if (!next) {
		if (reada && level == 1)
			reada_walk_down(trans, root, wc, path);
		next = read_tree_block(fs_info, bytenr, root->root_key.objectid,
				       generation, level - 1, &first_key);
		if (IS_ERR(next)) {
			return PTR_ERR(next);
		} else if (!extent_buffer_uptodate(next)) {
			free_extent_buffer(next);
			return -EIO;
		}
		btrfs_tree_lock(next);
	}

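	/*
	 * Keep walking down: wire 'next' into the path (write locked) and
	 * continue the walk from its first slot.
	 */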
	level--;
	ASSERT(level == btrfs_header_level(next));
	if (level != btrfs_header_level(next)) {
		btrfs_err(root->fs_info, "mismatched level");
		ret = -EIO;
		goto out_unlock;
	}
	path->nodes[level] = next;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK;
	wc->level = level;
	if (wc->level == 1)
		wc->reada_slot = 0;
	return 0;
skip:
	wc->refs[level - 1] = 0;
	wc->flags[level - 1] = 0;
	if (wc->stage == DROP_REFERENCE) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			parent = path->nodes[level]->start;
		} else {
			ASSERT(root->root_key.objectid ==
			       btrfs_header_owner(path->nodes[level]));
			if (root->root_key.objectid !=
			    btrfs_header_owner(path->nodes[level])) {
				btrfs_err(root->fs_info,
						"mismatched block owner");
				ret = -EIO;
				goto out_unlock;
			}
			parent = 0;
		}

		/*
		 * If we had a drop_progress we need to verify the refs are set
		 * as expected.  If we find our ref then we know that from here
		 * on out everything should be correct, and we can clear the
		 * ->restarted flag.
		 */
		if (wc->restarted) {
			ret = check_ref_exists(trans, root, bytenr, parent,
					       level - 1);
			if (ret < 0)
				goto out_unlock;
			if (ret == 0)
				goto no_delete;
			ret = 0;
			wc->restarted = 0;
		}

		/*
		 * Reloc tree doesn't contribute to qgroup numbers, and we have
		 * already accounted them at merge time (replace_path),
		 * thus we could skip expensive subtree trace here.
		 */
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
		    need_account) {
			ret = btrfs_qgroup_trace_subtree(trans, next,
							 generation, level - 1);
			if (ret) {
				btrfs_err_rl(fs_info,
					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
					     ret);
			}
		}

		/*
		 * We need to update the next key in our walk control so we can
		 * update the drop_progress key accordingly.  We don't care if
		 * find_next_key doesn't find a key because that means we're at
		 * the end and are going to clean up now.
		 */
		wc->drop_level = level;
		find_next_key(path, level, &wc->drop_progress);

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
				       fs_info->nodesize, parent);
		btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid,
				    0, false);
		ret = btrfs_free_extent(trans, &ref);
		if (ret)
			goto out_unlock;
	}
no_delete:
	*lookup_info = 1;
	ret = 1;

out_unlock:
	btrfs_tree_unlock(next);
	free_extent_buffer(next);

	return ret;
}

/*
 * helper to process tree block while walking up the tree.
 *
 * when wc->stage == DROP_REFERENCE, this function drops
 * reference count on the block.
 *
 * when wc->stage == UPDATE_BACKREF, this function changes
 * wc->stage back to DROP_REFERENCE if we changed wc->stage
 * to UPDATE_BACKREF previously while processing the block.
 *
 * NOTE: return value 1 means we should stop walking up.
 */
static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 parent = 0;

	if (wc->stage == UPDATE_BACKREF) {
		BUG_ON(wc->shared_level < level);
		if (level < wc->shared_level)
			goto out;

		ret = find_next_key(path, level + 1, &wc->update_progress);
		if (ret > 0)
			wc->update_ref = 0;

		wc->stage = DROP_REFERENCE;
		wc->shared_level = -1;
		path->slots[level] = 0;

		/*
		 * check reference count again if the block isn't locked.
		 * we should start walking down the tree again if reference
		 * count is one.
		 */
		if (!path->locks[level]) {
			BUG_ON(level == 0);
			btrfs_tree_lock(eb);
			path->locks[level] = BTRFS_WRITE_LOCK;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						       eb->start, level, 1,
						       &wc->refs[level],
						       &wc->flags[level]);
			if (ret < 0) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return ret;
			}
			BUG_ON(wc->refs[level] == 0);
			if (wc->refs[level] == 1) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return 1;
			}
		}
	}

	/* wc->stage == DROP_REFERENCE */
	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);

	if (wc->refs[level] == 1) {
		if (level == 0) {
			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				ret = btrfs_dec_ref(trans, root, eb, 1);
			else
				ret = btrfs_dec_ref(trans, root, eb, 0);
			BUG_ON(ret); /* -ENOMEM */
			if (is_fstree(root->root_key.objectid)) {
				ret = btrfs_qgroup_trace_leaf_items(trans, eb);
				if (ret) {
					btrfs_err_rl(fs_info,
	"error %d accounting leaf items, quota is out of sync, rescan required",
					     ret);
				}
			}
		}
		/* make block locked assertion in btrfs_clean_tree_block happy */
		if (!path->locks[level] &&
		    btrfs_header_generation(eb) == trans->transid) {
			btrfs_tree_lock(eb);
			path->locks[level] = BTRFS_WRITE_LOCK;
		}
		btrfs_clean_tree_block(eb);
	}

	if (eb == root->node) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = eb->start;
		else if (root->root_key.objectid != btrfs_header_owner(eb))
			goto owner_mismatch;
	} else {
		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = path->nodes[level + 1]->start;
		else if (root->root_key.objectid !=
			 btrfs_header_owner(path->nodes[level + 1]))
			goto owner_mismatch;
	}

	btrfs_free_tree_block(trans, btrfs_root_id(root), eb, parent,
			      wc->refs[level] == 1);
out:
	wc->refs[level] = 0;
	wc->flags[level] = 0;
	return 0;

owner_mismatch:
	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
		     btrfs_header_owner(eb), root->root_key.objectid);
	return -EUCLEAN;
}

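/*
 * Walk down from wc->level towards the leaves, processing each node with
 * walk_down_proc() and descending through do_walk_down() until one of them
 * tells us to stop (return value 1) or we reach a leaf.
 */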
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc)
{
	int level = wc->level;
	int lookup_info = 1;
	int ret;

	while (level >= 0) {
		ret = walk_down_proc(trans, root, path, wc, lookup_info);
		if (ret > 0)
			break;

		if (level == 0)
			break;

		if (path->slots[level] >=
		    btrfs_header_nritems(path->nodes[level]))
			break;

		ret = do_walk_down(trans, root, path, wc, &lookup_info);
		if (ret > 0) {
			path->slots[level]++;
			continue;
		} else if (ret < 0)
			return ret;
		level = wc->level;
	}
	return 0;
}

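/*
 * Walk back up towards max_level.  If a node still has unvisited slots we
 * return 0 so the caller walks down again; otherwise walk_up_proc() drops or
 * releases the block and we pop one level.  Returns 1 once the walk has gone
 * past max_level, i.e. the whole subtree has been processed.
 */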
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int max_level)
{
	int level = wc->level;
	int ret;

	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
	while (level < max_level && path->nodes[level]) {
		wc->level = level;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			path->slots[level]++;
			return 0;
		} else {
			ret = walk_up_proc(trans, root, path, wc);
			if (ret > 0)
				return 0;
			if (ret < 0)
				return ret;

			if (path->locks[level]) {
				btrfs_tree_unlock_rw(path->nodes[level],
						     path->locks[level]);
				path->locks[level] = 0;
			}
			free_extent_buffer(path->nodes[level]);
			path->nodes[level] = NULL;
			level++;
		}
	}
	return 1;
}

/*
 * drop a subvolume tree.
 *
 * this function traverses the tree freeing any blocks that are only
 * referenced by the tree.
 *
 * when a shared tree block is found, this function decreases its
 * reference count by one. if update_ref is true, this function
 * also makes sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 *
 * If called with for_reloc == 0, may exit early with -EAGAIN
 */
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root_item *root_item = &root->root_item;
	struct walk_control *wc;
	struct btrfs_key key;
	int err = 0;
	int ret;
	int level;
	bool root_dropped = false;

	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		err = -ENOMEM;
		goto out;
	}

	/*
	 * Use join to avoid potential EINTR from transaction start. See
	 * wait_reserve_ticket and the whole reservation callchain.
	 */
	if (for_reloc)
		trans = btrfs_join_transaction(tree_root);
	else
		trans = btrfs_start_transaction(tree_root, 0);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}

	err = btrfs_run_delayed_items(trans);
	if (err)
		goto out_end_trans;

	/*
	 * This will help us catch people modifying the fs tree while we're
	 * dropping it.  It is unsafe to mess with the fs tree while it's being
	 * dropped as we unlock the root node and parent nodes as we walk down
	 * the tree, assuming nothing will change.  If something does change
	 * then we'll have stale information and drop references to blocks we've
	 * already dropped.
	 */
	set_bit(BTRFS_ROOT_DELETING, &root->state);
	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_header_level(root->node);
		path->nodes[level] = btrfs_lock_root_node(root);
		path->slots[level] = 0;
		path->locks[level] = BTRFS_WRITE_LOCK;
		memset(&wc->update_progress, 0,
		       sizeof(wc->update_progress));
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
		memcpy(&wc->update_progress, &key,
		       sizeof(wc->update_progress));

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			err = ret;
			goto out_end_trans;
		}
		WARN_ON(ret > 0);

		/*
		 * unlock our path, this is safe because only this
		 * function is allowed to delete this snapshot
		 */
		btrfs_unlock_up_safe(path, 0);

		level = btrfs_header_level(root->node);
		while (1) {
			btrfs_tree_lock(path->nodes[level]);
			path->locks[level] = BTRFS_WRITE_LOCK;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						path->nodes[level]->start,
						level, 1, &wc->refs[level],
						&wc->flags[level]);
			if (ret < 0) {
				err = ret;
				goto out_end_trans;
			}
			BUG_ON(wc->refs[level] == 0);

			if (level == btrfs_root_drop_level(root_item))
				break;

			btrfs_tree_unlock(path->nodes[level]);
			path->locks[level] = 0;
			WARN_ON(wc->refs[level] != 1);
			level--;
		}
	}

	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = update_ref;
	wc->keep_locks = 0;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

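	/*
	 * Main drop loop: repeatedly walk down and back up the tree.  After
	 * each pass the current position is saved in the root item's
	 * drop_progress/drop_level so the drop can be resumed if we have to
	 * end the transaction or back off for the cleaner thread.
	 */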
	while (1) {

		ret = walk_down_tree(trans, root, path, wc);
		if (ret < 0) {
			err = ret;
			break;
		}

		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
		if (ret < 0) {
			err = ret;
			break;
		}

		if (ret > 0) {
			BUG_ON(wc->stage != DROP_REFERENCE);
			break;
		}

		if (wc->stage == DROP_REFERENCE) {
			wc->drop_level = wc->level;
			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
					      &wc->drop_progress,
					      path->slots[wc->drop_level]);
		}
		btrfs_cpu_key_to_disk(&root_item->drop_progress,
				      &wc->drop_progress);
		btrfs_set_root_drop_level(root_item, wc->drop_level);

		BUG_ON(wc->level == 0);
		if (btrfs_should_end_transaction(trans) ||
		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
			ret = btrfs_update_root(trans, tree_root,
						&root->root_key,
						root_item);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				err = ret;
				goto out_end_trans;
			}

			btrfs_end_transaction_throttle(trans);
			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
				btrfs_debug(fs_info,
					    "drop snapshot early exit");
				err = -EAGAIN;
				goto out_free;
			}

		       /*
			* Use join to avoid potential EINTR from transaction
			* start. See wait_reserve_ticket and the whole
			* reservation callchain.
			*/
			if (for_reloc)
				trans = btrfs_join_transaction(tree_root);
			else
				trans = btrfs_start_transaction(tree_root, 0);
			if (IS_ERR(trans)) {
				err = PTR_ERR(trans);
				goto out_free;
			}
		}
	}
	btrfs_release_path(path);
	if (err)
		goto out_end_trans;

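	/*
	 * The walk is done: delete the root item, and for non-reloc roots
	 * clean up the orphan item that tracked the pending deletion if the
	 * root item is already gone.
	 */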
	ret = btrfs_del_root(trans, &root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		err = ret;
		goto out_end_trans;
	}

	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_find_root(tree_root, &root->root_key, path,
				      NULL, NULL);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			err = ret;
			goto out_end_trans;
		} else if (ret > 0) {
			/* if we fail to delete the orphan item this time
			 * around, it'll get picked up the next time.
			 *
			 * The most common failure here is just -ENOENT.
			 */
			btrfs_del_orphan_item(trans, tree_root,
					      root->root_key.objectid);
		}
	}

	/*
	 * This subvolume is going to be completely dropped, and won't be
	 * recorded as dirty roots, thus pertrans meta rsv will not be freed at
	 * commit transaction time.  So free it here manually.
	 */
	btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
	btrfs_qgroup_free_meta_all_pertrans(root);

	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
		btrfs_add_dropped_root(trans, root);
	else
		btrfs_put_root(root);
	root_dropped = true;
out_end_trans:
	btrfs_end_transaction_throttle(trans);
out_free:
	kfree(wc);
	btrfs_free_path(path);
out:
	/*
	 * So if we need to stop dropping the snapshot for whatever reason we
	 * need to make sure to add it back to the dead root list so that we
	 * keep trying to do the work later.  This also cleans up roots if we
	 * don't have it in the radix (like when we recover after a power fail
	 * or unmount) so we don't leak memory.
	 */
	if (!for_reloc && !root_dropped)
		btrfs_add_dead_root(root);
	return err;
}

/*
 * drop subtree rooted at tree block 'node'.
 *
 * NOTE: this function will unlock and release tree block 'node'
 * only used by relocation code
 */
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct extent_buffer *node,
			struct extent_buffer *parent)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct walk_control *wc;
	int level;
	int parent_level;
	int ret = 0;
	int wret;

	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

	btrfs_assert_tree_write_locked(parent);
	parent_level = btrfs_header_level(parent);
	atomic_inc(&parent->refs);
	path->nodes[parent_level] = parent;
	path->slots[parent_level] = btrfs_header_nritems(parent);

	btrfs_assert_tree_write_locked(node);
	level = btrfs_header_level(node);
	path->nodes[level] = node;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK;

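	/*
	 * Seed the walk state: @parent is recorded as a full-backref block
	 * with a single reference, so blocks freed underneath use @parent as
	 * their backref parent and the walk, bounded at parent_level, never
	 * goes above it.
	 */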
	wc->refs[parent_level] = 1;
	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = 0;
	wc->keep_locks = 1;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {
		wret = walk_down_tree(trans, root, path, wc);
		if (wret < 0) {
			ret = wret;
			break;
		}

		wret = walk_up_tree(trans, root, path, wc, parent_level);
		if (wret < 0)
			ret = wret;
		if (wret != 0)
			break;
	}

	kfree(wc);
	btrfs_free_path(path);
	return ret;
}

/*
 * helper to account the unused space of all the readonly block groups in the
 * space_info. takes mirrors into account.
 */
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
	struct btrfs_block_group *block_group;
	u64 free_bytes = 0;
	int factor;

	/* It's df, we don't care if it's racy */
	if (list_empty(&sinfo->ro_bgs))
		return 0;

	spin_lock(&sinfo->lock);
	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
		spin_lock(&block_group->lock);

		if (!block_group->ro) {
			spin_unlock(&block_group->lock);
			continue;
		}

		factor = btrfs_bg_type_to_factor(block_group->flags);
		free_bytes += (block_group->length -
			       block_group->used) * factor;

		spin_unlock(&block_group->lock);
	}
	spin_unlock(&sinfo->lock);

	return free_bytes;
}

int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
				   u64 start, u64 end)
{
	return unpin_extent_range(fs_info, start, end, false);
}

/*
 * It used to be that old block groups would be left around forever.
 * Iterating over them would be enough to trim unused space.  Since we
 * now automatically remove them, we also need to iterate over unallocated
 * space.
 *
 * We don't want a transaction for this since the discard may take a
 * substantial amount of time.  We don't require that a transaction be
 * running, but we do need to take a running transaction into account
 * to ensure that we're not discarding chunks that were released or
 * allocated in the current transaction.
 *
 * Holding the chunks lock will prevent other threads from allocating
 * or releasing chunks, but it won't prevent a running transaction
 * from committing and releasing the memory that the pending chunks
 * list head uses.  For that, we need to take a reference to the
 * transaction and hold the commit root sem.  We only need to hold
 * it while performing the free space search since we have already
 * held back allocations.
 */
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
	u64 start = SZ_1M, len = 0, end = 0;
	int ret;

	*trimmed = 0;

	/* Discard not supported = nothing to do. */
	if (!blk_queue_discard(bdev_get_queue(device->bdev)))
		return 0;

	/* Not writable = nothing to do. */
	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return 0;

	/* No free space = nothing to do. */
	if (device->total_bytes <= device->bytes_used)
		return 0;

	ret = 0;

	while (1) {
		struct btrfs_fs_info *fs_info = device->fs_info;
		u64 bytes;

		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
		if (ret)
			break;

		find_first_clear_extent_bit(&device->alloc_state, start,
					    &start, &end,
					    CHUNK_TRIMMED | CHUNK_ALLOCATED);

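		/*
		 * [start, end] now covers the first range in this device's
		 * alloc_state that is neither marked CHUNK_TRIMMED nor
		 * CHUNK_ALLOCATED.
		 */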
		/* Check if there are any CHUNK_* bits left */
		if (start > device->total_bytes) {
			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
			btrfs_warn_in_rcu(fs_info,
"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
					  start, end - start + 1,
					  rcu_str_deref(device->name),
					  device->total_bytes);
			mutex_unlock(&fs_info->chunk_mutex);
			ret = 0;
			break;
		}

		/* Ensure we skip the reserved area in the first 1M */
		start = max_t(u64, start, SZ_1M);

		/*
		 * If find_first_clear_extent_bit finds a range that spans the
		 * end of the device it will set end to -1, in this case it's up
		 * to the caller to trim the value to the size of the device.
		 */
		end = min(end, device->total_bytes - 1);

		len = end - start + 1;

		/* We didn't find any extents */
		if (!len) {
			mutex_unlock(&fs_info->chunk_mutex);
			ret = 0;
			break;
		}

		ret = btrfs_issue_discard(device->bdev, start, len,
					  &bytes);
		if (!ret)
			set_extent_bits(&device->alloc_state, start,
					start + bytes - 1,
					CHUNK_TRIMMED);
		mutex_unlock(&fs_info->chunk_mutex);

		if (ret)
			break;

		start += len;
		*trimmed += bytes;

		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	return ret;
}

/*
 * Trim the whole filesystem by:
 * 1) trimming the free space in each block group
 * 2) trimming the unallocated space on each device
 *
 * This will also continue trimming even if a block group or device encounters
 * an error.  The return value will be the last error, or 0 if nothing bad
 * happens.
 */
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_block_group *cache = NULL;
	struct btrfs_device *device;
	u64 group_trimmed;
	u64 range_end = U64_MAX;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 bg_failed = 0;
	u64 dev_failed = 0;
	int bg_ret = 0;
	int dev_ret = 0;
	int ret = 0;

	if (range->start == U64_MAX)
		return -EINVAL;

	/*
	 * Check range overflow if range->len is set.
	 * The default range->len is U64_MAX.
	 */
	if (range->len != U64_MAX &&
	    check_add_overflow(range->start, range->len, &range_end))
		return -EINVAL;

	cache = btrfs_lookup_first_block_group(fs_info, range->start);
	for (; cache; cache = btrfs_next_block_group(cache)) {
		if (cache->start >= range_end) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->start);
		end = min(range_end, cache->start + cache->length);

		if (end - start >= range->minlen) {
			if (!btrfs_block_group_done(cache)) {
				ret = btrfs_cache_block_group(cache, 0);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
				ret = btrfs_wait_block_group_cache_done(cache);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				bg_failed++;
				bg_ret = ret;
				continue;
			}
		}
	}

	if (bg_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu block group(s), last error %d",
			bg_failed, bg_ret);

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
			continue;

		ret = btrfs_trim_free_extents(device, &group_trimmed);
		if (ret) {
			dev_failed++;
			dev_ret = ret;
			break;
		}

		trimmed += group_trimmed;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	if (dev_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu device(s), last error %d",
			dev_failed, dev_ret);
	range->len = trimmed;
	if (bg_ret)
		return bg_ret;
	return dev_ret;
}