// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "misc.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
#include "rcu-string.h"
#include "zoned.h"
#include "dev-replace.h"

#undef SCRAMBLE_DELAYED_REFS

static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);

static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
			      u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&fs_info->excluded_extents, start, end,
			EXTENT_UPTODATE);
	return 0;
}

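/*
 * Clear any ranges previously marked excluded (see
 * btrfs_add_excluded_extent()) that fall inside this block group.
 */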
void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 start, end;

	start = cache->start;
	end = start + cache->length - 1;

	clear_extent_bits(&fs_info->excluded_extents, start, end,
			  EXTENT_UPTODATE);
}

/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * to see what the reference count and extent flags would be if all of
 * the delayed refs are not processed.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
		offset = fs_info->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
			ret = -EINVAL;
			btrfs_print_v0_err(fs_info);
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);

			goto out_free;
		}

		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			refcount_inc(&head->refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and try
			 * again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref_head(head);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. Implicit back refs are optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. Full back refs are for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back refs. The full back ref scheme is actually generic,
 * and can be used in all cases where implicit back refs are used. Its major
 * shortcoming is the overhead: every time a tree block gets COWed, we have
 * to update the back ref entries for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back ref conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs are used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs are used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is a hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf
 *
 * When a file extent is allocated, The implicit back refs is used.
 * the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of key. The key offset for the implicit back refs is
 * objectid of block's owner tree. The key offset for the full back refs
 * is the first byte of parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required. This information is stored in
 * the tree block info structure.
 */
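
/*
 * A short illustration of the key composition described above, for a data
 * extent at bytenr X referenced from subvolume tree 5, inode 257, file
 * offset 0 (values are only an example):
 *
 *	implicit back ref:  (X, BTRFS_EXTENT_DATA_REF_KEY, hash(5, 257, 0))
 *	full back ref:      (X, BTRFS_SHARED_DATA_REF_KEY, bytenr of parent leaf)
 *
 * The hash is computed by hash_extent_data_ref() below.
 */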

/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				     struct btrfs_extent_inline_ref *iref,
				     enum btrfs_inline_ref_type is_data)
{
	int type = btrfs_extent_inline_ref_type(eb, iref);
	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_DATA_REF_KEY ||
	    type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (is_data == BTRFS_REF_TYPE_BLOCK) {
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree block,
				 * which must be aligned to sector size.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->sectorsize))
					return type;
			}
		} else if (is_data == BTRFS_REF_TYPE_DATA) {
			if (type == BTRFS_EXTENT_DATA_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_DATA_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree block,
				 * which must be aligned to sector size.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->sectorsize))
					return type;
			}
		} else {
			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
			return type;
		}
	}

	btrfs_print_leaf((struct extent_buffer *)eb);
	btrfs_err(eb->fs_info,
		  "eb %llu iref 0x%lx invalid extent inline ref type %d",
		  eb->start, (unsigned long)iref, type);
	WARN_ON(1);

	return BTRFS_REF_TYPE_INVALID;
}

u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}
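
/*
 * Example: this hash provides the key offset used for implicit data back
 * refs, e.g.
 *
 *	key.offset = hash_extent_data_ref(root_objectid, owner, offset);
 *
 * which is how lookup_extent_data_ref() and insert_extent_data_ref() below
 * position their searches when no parent block is given.
 */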

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
Z
Zheng Yan 已提交
586 587
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   int refs_to_drop, int *last_ref)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
		btrfs_print_v0_err(trans->fs_info);
		btrfs_abort_transaction(trans, -EINVAL);
		return -EINVAL;
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
		*last_ref = 1;
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;
	int type;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
	if (iref) {
		/*
		 * If type is invalid, we should have bailed out earlier than
		 * this call.
		 */
		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
		ASSERT(type != BTRFS_REF_TYPE_INVALID);
		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
				      path, &key, 0);
	btrfs_release_path(path);
	return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}
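
/*
 * In short: a non-zero parent selects the "full"/shared back ref keyed by
 * the parent block, otherwise the implicit back ref keyed by the owner
 * tree (metadata) or by (root, inode, offset) (data) is used.
 */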

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)

{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
	int needed;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->search_for_extension = 1;
		path->keep_locks = 1;
	} else
		extra_size = -1;

	/*
	 * Owner is our level, so we can just add one to get the level for the
	 * block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		skinny_metadata = false;
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

	if (ret && !insert) {
		err = -ENOENT;
		goto out;
	} else if (WARN_ON(ret)) {
		err = -EIO;
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (unlikely(item_size < sizeof(*ei))) {
		err = -EINVAL;
		btrfs_print_v0_err(fs_info);
		btrfs_abort_transaction(trans, err);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
		needed = BTRFS_REF_TYPE_DATA;
	else
		needed = BTRFS_REF_TYPE_BLOCK;

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
		if (type == BTRFS_REF_TYPE_INVALID) {
			err = -EUCLEAN;
			goto out;
		}

		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		path->search_for_extension = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

/*
 * helper to add new inline back ref
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	btrfs_extend_item(path, size);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset);
	}
	return ret;
}

/*
 * helper to update/remove inline back ref
 */
static noinline_for_stack
void update_inline_extent_backref(struct btrfs_path *path,
				  struct btrfs_extent_inline_ref *iref,
				  int refs_to_mod,
				  struct btrfs_delayed_extent_op *extent_op,
				  int *last_ref)
{
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	u64 refs;

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	/*
	 * If type is invalid, we should have bailed out after
	 * lookup_inline_extent_backref().
	 */
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
	ASSERT(type != BTRFS_REF_TYPE_INVALID);

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
	}

	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		*last_ref = 1;
		size =  btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		btrfs_truncate_item(path, item_size, 1);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 1);
	if (ret == 0) {
		/*
		 * We're adding refs to a tree block we already own, this
		 * should not happen at all.
		 */
		if (owner < BTRFS_FIRST_FREE_OBJECTID) {
			btrfs_crit(trans->fs_info,
"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
				   bytenr, num_bytes, root_objectid);
			if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
				WARN_ON(1);
				btrfs_crit(trans->fs_info,
			"path->slots[0]=%d path->nodes[0]:", path->slots[0]);
				btrfs_print_leaf(path->nodes[0]);
			}
			return -EUCLEAN;
		}
		update_inline_extent_backref(path, iref, refs_to_add,
					     extent_op, NULL);
	} else if (ret == -ENOENT) {
		setup_inline_extent_backref(trans->fs_info, path, iref, parent,
					    root_objectid, owner, offset,
					    refs_to_add, extent_op);
		ret = 0;
	}
	return ret;
}

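/*
 * Drop inline or keyed back refs for an extent. *last_ref is set to 1 when
 * the last reference to the extent item went away.
 */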
static int remove_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 int refs_to_drop, int is_data, int *last_ref)
{
	int ret = 0;

	BUG_ON(!is_data && refs_to_drop != 1);
	if (iref) {
		update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
					     last_ref);
	} else if (is_data) {
		ret = remove_extent_data_ref(trans, path, refs_to_drop,
					     last_ref);
	} else {
		*last_ref = 1;
		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
	}
	return ret;
}

static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
			       u64 *discarded_bytes)
{
	int j, ret = 0;
	u64 bytes_left, end;
	u64 aligned_start = ALIGN(start, 1 << 9);

	if (WARN_ON(start != aligned_start)) {
		len -= aligned_start - start;
		len = round_down(len, 1 << 9);
		start = aligned_start;
	}

	*discarded_bytes = 0;

	if (!len)
		return 0;

	end = start + len;
	bytes_left = len;

	/* Skip any superblocks on this device. */
	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
		u64 sb_start = btrfs_sb_offset(j);
		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
		u64 size = sb_start - start;

		if (!in_range(sb_start, start, bytes_left) &&
		    !in_range(sb_end, start, bytes_left) &&
		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
			continue;

		/*
		 * Superblock spans beginning of range.  Adjust start and
		 * try again.
		 */
		if (sb_start <= start) {
			start += sb_end - start;
			if (start > end) {
				bytes_left = 0;
				break;
			}
			bytes_left = end - start;
			continue;
		}

		if (size) {
			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
						   GFP_NOFS, 0);
			if (!ret)
				*discarded_bytes += size;
			else if (ret != -EOPNOTSUPP)
				return ret;
		}

		start = sb_end;
		if (start > end) {
			bytes_left = 0;
			break;
		}
		bytes_left = end - start;
	}

	if (bytes_left) {
		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
					   GFP_NOFS, 0);
		if (!ret)
			*discarded_bytes += bytes_left;
	}
	return ret;
}

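/*
 * Discard one stripe worth of physical bytes, either by resetting the zone
 * on a zoned device or by issuing a block layer discard; the number of
 * bytes actually discarded is returned in @bytes.
 */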
static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes)
{
	struct btrfs_device *dev = stripe->dev;
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	u64 phys = stripe->physical;
	u64 len = stripe->length;
	u64 discarded = 0;
	int ret = 0;

	/* Zone reset on a zoned filesystem */
	if (btrfs_can_zone_reset(dev, phys, len)) {
		u64 src_disc;

		ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
		if (ret)
			goto out;

		if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
		    dev != dev_replace->srcdev)
			goto out;

		src_disc = discarded;

		/* Send to replace target as well */
		ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
					      &discarded);
		discarded += src_disc;
	} else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) {
		ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
	} else {
		ret = 0;
		*bytes = 0;
	}

out:
	*bytes = discarded;
	return ret;
}

int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
			 u64 num_bytes, u64 *actual_bytes)
{
	int ret = 0;
	u64 discarded_bytes = 0;
	u64 end = bytenr + num_bytes;
	u64 cur = bytenr;
	struct btrfs_bio *bbio = NULL;


	/*
	 * Avoid races with device replace and make sure our bbio has devices
	 * associated to its stripes that don't go away while we are discarding.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	while (cur < end) {
		struct btrfs_bio_stripe *stripe;
		int i;

		num_bytes = end - cur;
		/* Tell the block device(s) that the sectors can be discarded */
		ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
				      &num_bytes, &bbio, 0);
		/*
		 * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
		 * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
		 * thus we can't continue anyway.
		 */
		if (ret < 0)
			goto out;

		stripe = bbio->stripes;
		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
			u64 bytes;
			struct btrfs_device *device = stripe->dev;

			if (!device->bdev) {
				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
				continue;
			}

			if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
				continue;

			ret = do_discard_extent(stripe, &bytes);
			if (!ret) {
				discarded_bytes += bytes;
			} else if (ret != -EOPNOTSUPP) {
				/*
				 * Logic errors or -ENOMEM, or -EIO, but
				 * unlikely to happen.
				 *
				 * And since there are two loops, explicitly
				 * go to out to avoid confusion.
				 */
				btrfs_put_bbio(bbio);
				goto out;
			}

			/*
			 * Just in case we get back EOPNOTSUPP for some reason,
			 * just ignore the return value so we don't screw up
			 * people calling discard_extent.
			 */
			ret = 0;
		}
		btrfs_put_bbio(bbio);
		cur += num_bytes;
	}
out:
	btrfs_bio_counter_dec(fs_info);

	if (actual_bytes)
		*actual_bytes = discarded_bytes;


	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret;
}

/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_ref *generic_ref)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
	       generic_ref->action);
	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
	       generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);

	if (generic_ref->type == BTRFS_REF_METADATA)
		ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
	else
		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);

	btrfs_ref_tree_mod(fs_info, generic_ref);

	return ret;
}

/*
 * __btrfs_inc_extent_ref - insert backreference for a given extent
 *
1415 1416 1417
 * The counterpart is in __btrfs_free_extent(), with examples and more details
 * how it works.
 *
1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448
 * @trans:	    Handle of transaction
 *
 * @node:	    The delayed ref node used to get the bytenr/length for
 *		    extent whose references are incremented.
 *
 * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
 *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
 *		    bytenr of the parent block. Since new extents are always
 *		    created with indirect references, this will only be the case
 *		    when relocating a shared extent. In that case, root_objectid
 *		    will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
 *		    be 0
 *
 * @root_objectid:  The id of the root where this modification has originated,
 *		    this can be either one of the well-known metadata trees or
 *		    the subvolume id which references this extent.
 *
 * @owner:	    For data extents it is the inode number of the owning file.
 *		    For metadata extents this parameter holds the level in the
 *		    tree of the extent.
 *
 * @offset:	    For metadata extents the offset is ignored and is currently
 *		    always passed as 0. For data extents it is the fileoffset
 *		    this extent belongs to.
 *
 * @refs_to_add     Number of references to add
 *
 * @extent_op       Pointer to a structure, holding information necessary when
 *                  updating a tree block's flags
 *
 */
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_delayed_ref_node *node,
				  u64 parent, u64 root_objectid,
				  u64 owner, u64 offset, int refs_to_add,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
	struct btrfs_key key;
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	u64 refs;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* this will setup the path even if it fails to insert the back ref */
	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
					   parent, root_objectid, owner,
					   offset, refs_to_add, extent_op);
	if ((ret < 0 && ret != -EAGAIN) || !ret)
		goto out;

	/*
	 * Ok we had -EAGAIN which means we didn't have space to insert and
	 * inline extent ref, so just update the reference count and add a
	 * normal backref.
	 */
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, item);
	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, item);

	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/* now insert the actual backref */
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		BUG_ON(refs_to_add != 1);
		ret = insert_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = insert_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset,
					     refs_to_add);
	}
	if (ret)
		btrfs_abort_transaction(trans, ret);
out:
	btrfs_free_path(path);
	return ret;
}

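/*
 * Process a single delayed ref for a data extent: insert the reserved
 * extent item, add one more back ref, or drop a back ref / free the
 * extent, depending on the queued action.
 */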
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_key ins;
	u64 parent = 0;
	u64 ref_root = 0;
	u64 flags = 0;

	ins.objectid = node->bytenr;
	ins.offset = node->num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;

	ref = btrfs_delayed_node_to_data_ref(node);
	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);

	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
		parent = ref->parent;
	ref_root = ref->root;

	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		if (extent_op)
			flags |= extent_op->flags_to_set;
		ret = alloc_reserved_file_extent(trans, parent, ref_root,
						 flags, ref->objectid,
						 ref->offset, &ins,
						 node->ref_mod);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
					     ref->objectid, ref->offset,
					     node->ref_mod, extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, node, parent,
					  ref_root, ref->objectid,
					  ref->offset, node->ref_mod,
					  extent_op);
	} else {
		BUG();
	}
	return ret;
}

static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei)
{
	u64 flags = btrfs_extent_flags(leaf, ei);
	if (extent_op->update_flags) {
		flags |= extent_op->flags_to_set;
		btrfs_set_extent_flags(leaf, ei, flags);
	}

	if (extent_op->update_key) {
		struct btrfs_tree_block_info *bi;
		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
	}
}

static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
				 struct btrfs_delayed_ref_head *head,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	u32 item_size;
	int ret;
	int err = 0;
	int metadata = !extent_op->is_data;

	if (TRANS_ABORTED(trans))
		return 0;

	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		metadata = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = head->bytenr;

	if (metadata) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = extent_op->level;
	} else {
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = head->num_bytes;
	}

again:
	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}
	if (ret > 0) {
		if (metadata) {
			if (path->slots[0] > 0) {
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == head->bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == head->num_bytes)
					ret = 0;
			}
			if (ret > 0) {
				btrfs_release_path(path);
				metadata = 0;

				key.objectid = head->bytenr;
				key.offset = head->num_bytes;
				key.type = BTRFS_EXTENT_ITEM_KEY;
				goto again;
			}
		} else {
			err = -EIO;
			goto out;
		}
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);

	if (unlikely(item_size < sizeof(*ei))) {
		err = -EINVAL;
		btrfs_print_v0_err(fs_info);
		btrfs_abort_transaction(trans, err);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	__run_delayed_extent_op(extent_op, leaf, ei);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return err;
}

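/*
 * Same as run_delayed_data_ref() but for tree blocks; tree block refs are
 * always modified one reference at a time.
 */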
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_tree_ref *ref;
	u64 parent = 0;
	u64 ref_root = 0;

	ref = btrfs_delayed_node_to_tree_ref(node);
	trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		parent = ref->parent;
	ref_root = ref->root;

	if (node->ref_mod != 1) {
		btrfs_err(trans->fs_info,
	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
			  node->bytenr, node->ref_mod, node->action, ref_root,
			  parent);
		return -EIO;
	}
	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		BUG_ON(!extent_op || !extent_op->update_flags);
		ret = alloc_reserved_tree_block(trans, node, extent_op);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
					     ref->level, 0, 1, extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, node, parent, ref_root,
					  ref->level, 0, 1, extent_op);
	} else {
		BUG();
	}
	return ret;
}

/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node,
			       struct btrfs_delayed_extent_op *extent_op,
			       int insert_reserved)
{
	int ret = 0;

	if (TRANS_ABORTED(trans)) {
		if (insert_reserved)
			btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
		return 0;
	}

	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		ret = run_delayed_tree_ref(trans, node, extent_op,
					   insert_reserved);
	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
		 node->type == BTRFS_SHARED_DATA_REF_KEY)
		ret = run_delayed_data_ref(trans, node, extent_op,
					   insert_reserved);
	else
		BUG();
	if (ret && insert_reserved)
		btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
	return ret;
}

static inline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_ref_node *ref;

	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
		return NULL;

	/*
	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
	 * This is to prevent a ref count from going down to zero, which deletes
	 * the extent item from the extent tree, when there still are references
	 * to add, which would fail because they would not find the extent item.
	 */
	if (!list_empty(&head->ref_add_list))
		return list_first_entry(&head->ref_add_list,
				struct btrfs_delayed_ref_node, add_list);

	ref = rb_entry(rb_first_cached(&head->ref_tree),
		       struct btrfs_delayed_ref_node, ref_node);
	ASSERT(list_empty(&ref->add_list));
	return ref;
}

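/*
 * Give a ref head back to the delayed ref tree after a failed attempt to
 * process it, so that another task can pick it up again later.
 */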
static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
				      struct btrfs_delayed_ref_head *head)
{
	spin_lock(&delayed_refs->lock);
	head->processing = 0;
	delayed_refs->num_heads_ready++;
	spin_unlock(&delayed_refs->lock);
	btrfs_delayed_ref_unlock(head);
}

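/*
 * Return the pending extent op of a ref head, or NULL if there is nothing
 * to run.  If the head still has to insert its reserved extent, the op is
 * freed here because its flags are applied when the extent item is
 * inserted.
 */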
static struct btrfs_delayed_extent_op *cleanup_extent_op(
				struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_extent_op *extent_op = head->extent_op;

	if (!extent_op)
		return NULL;

	if (head->must_insert_reserved) {
		head->extent_op = NULL;
		btrfs_free_delayed_extent_op(extent_op);
		return NULL;
	}
	return extent_op;
}

static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	extent_op = cleanup_extent_op(head);
	if (!extent_op)
		return 0;
	head->extent_op = NULL;
	spin_unlock(&head->lock);
	ret = run_delayed_extent_op(trans, head, extent_op);
	btrfs_free_delayed_extent_op(extent_op);
	return ret ? ret : 1;
}

void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
				  struct btrfs_delayed_ref_root *delayed_refs,
				  struct btrfs_delayed_ref_head *head)
{
	int nr_items = 1;	/* Dropping this ref head update. */

	/*
	 * We had csum deletions accounted for in our delayed refs rsv, we need
	 * to drop the csum leaves for this update from our delayed_refs_rsv.
	 */
	if (head->total_ref_mod < 0 && head->is_data) {
		spin_lock(&delayed_refs->lock);
		delayed_refs->pending_csums -= head->num_bytes;
		spin_unlock(&delayed_refs->lock);
		nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
	}

	/*
	 * We were dropping refs, or had a new ref and dropped it, and thus must
	 * adjust down our total_bytes_pinned, the space may or may not have
	 * been pinned and so is accounted for properly in the pinned space by
	 * now.
	 */
	if (head->total_ref_mod < 0 ||
	    (head->total_ref_mod == 0 && head->must_insert_reserved)) {
		u64 flags = btrfs_ref_head_to_space_flags(head);

		btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes);
	}

	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
}

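/*
 * Final processing of a ref head once all of its refs have been run: apply
 * any pending extent op, unlink the head from the delayed ref tree, pin the
 * extent if a reserved but never inserted extent has to be released, and
 * drop the ref head accounting.
 */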
static int cleanup_ref_head(struct btrfs_trans_handle *trans,
			    struct btrfs_delayed_ref_head *head)
{

	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	ret = run_and_cleanup_extent_op(trans, head);
	if (ret < 0) {
		unselect_delayed_ref_head(delayed_refs, head);
		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
		return ret;
	} else if (ret) {
		return ret;
	}

	/*
	 * Need to drop our head ref lock and re-acquire the delayed ref lock
	 * and then re-check to make sure nobody got added.
	 */
	spin_unlock(&head->lock);
	spin_lock(&delayed_refs->lock);
	spin_lock(&head->lock);
	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
		spin_unlock(&head->lock);
		spin_unlock(&delayed_refs->lock);
		return 1;
	}
	btrfs_delete_ref_head(delayed_refs, head);
	spin_unlock(&head->lock);
	spin_unlock(&delayed_refs->lock);

	if (head->must_insert_reserved) {
		btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
		if (head->is_data) {
			ret = btrfs_del_csums(trans, fs_info->csum_root,
					      head->bytenr, head->num_bytes);
		}
	}

	btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);

	trace_run_delayed_ref_head(fs_info, head, 0);
	btrfs_delayed_ref_unlock(head);
	btrfs_put_delayed_ref_head(head);
	return ret;
}

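/*
 * Select the next ref head to process and lock it.  Returns NULL when
 * nothing is ready, or ERR_PTR(-EAGAIN) when the head went away while we
 * waited for its mutex and the caller should simply try again.
 */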
static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
					struct btrfs_trans_handle *trans)
{
	struct btrfs_delayed_ref_root *delayed_refs =
		&trans->transaction->delayed_refs;
	struct btrfs_delayed_ref_head *head = NULL;
	int ret;

	spin_lock(&delayed_refs->lock);
	head = btrfs_select_ref_head(delayed_refs);
	if (!head) {
		spin_unlock(&delayed_refs->lock);
		return head;
	}

	/*
	 * Grab the lock that says we are going to process all the refs for
	 * this head
	 */
	ret = btrfs_delayed_ref_lock(delayed_refs, head);
	spin_unlock(&delayed_refs->lock);

	/*
	 * We may have dropped the spin lock to get the head mutex lock, and
	 * that might have given someone else time to free the head.  If that's
	 * true, it has been removed from our list and we can move on.
	 */
	if (ret == -EAGAIN)
		head = ERR_PTR(-EAGAIN);

	return head;
}

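/*
 * Run every ref currently queued on a locked ref head.  Each ref is removed
 * from the head's rbtree, applied with run_one_delayed_ref(), and the
 * remaining refs are re-merged afterwards.  *run_refs counts the refs that
 * were actually processed.
 */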
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
				    struct btrfs_delayed_ref_head *locked_ref,
				    unsigned long *run_refs)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_extent_op *extent_op;
	struct btrfs_delayed_ref_node *ref;
	int must_insert_reserved = 0;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	lockdep_assert_held(&locked_ref->mutex);
	lockdep_assert_held(&locked_ref->lock);

	while ((ref = select_delayed_ref(locked_ref))) {
		if (ref->seq &&
		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
			spin_unlock(&locked_ref->lock);
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			return -EAGAIN;
		}

		(*run_refs)++;
		ref->in_tree = 0;
		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
		RB_CLEAR_NODE(&ref->ref_node);
		if (!list_empty(&ref->add_list))
			list_del(&ref->add_list);
		/*
		 * When we play the delayed ref, also correct the ref_mod on
		 * head
		 */
		switch (ref->action) {
		case BTRFS_ADD_DELAYED_REF:
		case BTRFS_ADD_DELAYED_EXTENT:
			locked_ref->ref_mod -= ref->ref_mod;
			break;
		case BTRFS_DROP_DELAYED_REF:
			locked_ref->ref_mod += ref->ref_mod;
			break;
		default:
			WARN_ON(1);
		}
		atomic_dec(&delayed_refs->num_entries);

		/*
		 * Record the must_insert_reserved flag before we drop the
		 * spin lock.
		 */
		must_insert_reserved = locked_ref->must_insert_reserved;
		locked_ref->must_insert_reserved = 0;

		extent_op = locked_ref->extent_op;
		locked_ref->extent_op = NULL;
		spin_unlock(&locked_ref->lock);

		ret = run_one_delayed_ref(trans, ref, extent_op,
					  must_insert_reserved);

		btrfs_free_delayed_extent_op(extent_op);
		if (ret) {
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			btrfs_put_delayed_ref(ref);
			btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
				    ret);
			return ret;
		}

		btrfs_put_delayed_ref(ref);
		cond_resched();

		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
	}

	return 0;
}

/*
 * Returns 0 on success or if called with an already aborted transaction.
 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
 */
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
					     unsigned long nr)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *locked_ref = NULL;
	ktime_t start = ktime_get();
	int ret;
	unsigned long count = 0;
	unsigned long actual_count = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	do {
		if (!locked_ref) {
			locked_ref = btrfs_obtain_ref_head(trans);
			if (IS_ERR_OR_NULL(locked_ref)) {
				if (PTR_ERR(locked_ref) == -EAGAIN) {
					continue;
				} else {
					break;
				}
			}
			count++;
		}
		/*
		 * We need to try and merge add/drops of the same ref since we
		 * can run into issues with relocate dropping the implicit ref
		 * and then it being added back again before the drop can
		 * finish.  If we merged anything we need to re-loop so we can
		 * get a good ref.
		 * Or we can get node references of the same type that weren't
		 * merged when created due to bumps in the tree mod seq, and
		 * we need to merge them to prevent adding an inline extent
		 * backref before dropping it (triggering a BUG_ON at
		 * insert_inline_extent_backref()).
		 */
		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);

		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
						      &actual_count);
		if (ret < 0 && ret != -EAGAIN) {
			/*
			 * Error, btrfs_run_delayed_refs_for_head already
			 * unlocked everything so just bail out
			 */
			return ret;
		} else if (!ret) {
			/*
			 * Success, perform the usual cleanup of a processed
			 * head
			 */
			ret = cleanup_ref_head(trans, locked_ref);
			if (ret > 0) {
				/* We dropped our lock, we need to loop. */
				ret = 0;
				continue;
			} else if (ret) {
				return ret;
			}
		}

		/*
		 * Either success case or btrfs_run_delayed_refs_for_head
		 * returned -EAGAIN, meaning we need to select another head
		 */

		locked_ref = NULL;
		cond_resched();
	} while ((nr != -1 && count < nr) || locked_ref);

	/*
	 * We don't want to include ref heads since we can have empty ref heads
	 * and those will drastically skew our runtime down since we just do
	 * accounting, no actual extent tree updates.
	 */
	if (actual_count > 0) {
		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
		u64 avg;

		/*
		 * We weigh the current average higher than our current runtime
		 * to avoid large swings in the average.
		 */
		spin_lock(&delayed_refs->lock);
		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
		spin_unlock(&delayed_refs->lock);
	}
	return 0;
}

#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning
 */
static u64 find_middle(struct rb_root *root)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int alt = 1;
	u64 middle;
	u64 first = 0, last = 0;

	n = rb_first(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		first = entry->bytenr;
	}
	n = rb_last(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		last = entry->bytenr;
	}
	n = root->rb_node;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		middle = entry->bytenr;

		if (alt)
			n = n->rb_left;
		else
			n = n->rb_right;

		alt = 1 - alt;
	}
	return middle;
}
#endif

/*
 * this starts processing the delayed reference count updates and
 * extent insertions we have queued up so far.  count can be
 * 0, which means to process everything in the tree at the start
 * of the run (but not newly added entries), or it can be some target
 * number you'd like to process.
2132 2133 2134
 *
 * Returns 0 on success or if called with an aborted transaction
 * Returns <0 on error and aborts the transaction
2135 2136
 */
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2137
			   unsigned long count)
2138
{
2139
	struct btrfs_fs_info *fs_info = trans->fs_info;
2140 2141
	struct rb_node *node;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *head;
2143 2144 2145
	int ret;
	int run_all = count == (unsigned long)-1;

2146
	/* We'll clean this up in btrfs_cleanup_transaction */
2147
	if (TRANS_ABORTED(trans))
2148 2149
		return 0;

2150
	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2151 2152
		return 0;

2153
	delayed_refs = &trans->transaction->delayed_refs;
	if (count == 0)
2155
		count = delayed_refs->num_heads_ready;
2156

2157
again:
2158 2159 2160
#ifdef SCRAMBLE_DELAYED_REFS
	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
2161
	ret = __btrfs_run_delayed_refs(trans, count);
2162
	if (ret < 0) {
2163
		btrfs_abort_transaction(trans, ret);
2164
		return ret;
2165
	}
2166

2167
	if (run_all) {
2168
		btrfs_create_pending_block_groups(trans);
2169

2170
		spin_lock(&delayed_refs->lock);
2171
		node = rb_first_cached(&delayed_refs->href_root);
2172 2173
		if (!node) {
			spin_unlock(&delayed_refs->lock);
2174
			goto out;
2175
		}
2176 2177 2178 2179
		head = rb_entry(node, struct btrfs_delayed_ref_head,
				href_node);
		refcount_inc(&head->refs);
		spin_unlock(&delayed_refs->lock);
2180

2181 2182 2183
		/* Mutex was contended, block until it's released and retry. */
		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);
2184

2185
		btrfs_put_delayed_ref_head(head);
2186
		cond_resched();
2187
		goto again;
2188
	}
2189
out:
2190 2191 2192
	return 0;
}

2193
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2194
				struct extent_buffer *eb, u64 flags,
2195
				int level, int is_data)
2196 2197 2198 2199
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

2200
	extent_op = btrfs_alloc_delayed_extent_op();
2201 2202 2203 2204
	if (!extent_op)
		return -ENOMEM;

	extent_op->flags_to_set = flags;
2205 2206 2207
	extent_op->update_flags = true;
	extent_op->update_key = false;
	extent_op->is_data = is_data ? true : false;
2208
	extent_op->level = level;
2209

2210
	ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
2211
	if (ret)
2212
		btrfs_free_delayed_extent_op(extent_op);
2213 2214 2215
	return ret;
}

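/*
 * Check whether the running transaction has delayed refs for @bytenr that
 * would make the extent shared, i.e. referenced by another root, inode or
 * offset.  Returns 1 if such a reference exists, 0 if not, and -EAGAIN if
 * the head mutex was contended and the caller must retry.
 */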
static noinline int check_delayed_ref(struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid, u64 offset, u64 bytenr)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_data_ref *data_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
2224
	struct btrfs_transaction *cur_trans;
2225
	struct rb_node *node;
2226 2227
	int ret = 0;

2228
	spin_lock(&root->fs_info->trans_lock);
2229
	cur_trans = root->fs_info->running_transaction;
2230 2231 2232
	if (cur_trans)
		refcount_inc(&cur_trans->use_count);
	spin_unlock(&root->fs_info->trans_lock);
2233 2234 2235 2236
	if (!cur_trans)
		return 0;

	delayed_refs = &cur_trans->delayed_refs;
2237
	spin_lock(&delayed_refs->lock);
2238
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2239 2240
	if (!head) {
		spin_unlock(&delayed_refs->lock);
2241
		btrfs_put_transaction(cur_trans);
2242 2243
		return 0;
	}
2244 2245

	if (!mutex_trylock(&head->mutex)) {
2246
		refcount_inc(&head->refs);
2247 2248
		spin_unlock(&delayed_refs->lock);

2249
		btrfs_release_path(path);
2250

2251 2252 2253 2254
		/*
		 * Mutex was contended, block until it's released and let
		 * caller try again
		 */
2255 2256
		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);
2257
		btrfs_put_delayed_ref_head(head);
2258
		btrfs_put_transaction(cur_trans);
2259 2260
		return -EAGAIN;
	}
2261
	spin_unlock(&delayed_refs->lock);
2262

2263
	spin_lock(&head->lock);
2264 2265 2266 2267
	/*
	 * XXX: We should replace this with a proper search function in the
	 * future.
	 */
2268 2269
	for (node = rb_first_cached(&head->ref_tree); node;
	     node = rb_next(node)) {
2270
		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2271 2272 2273 2274 2275
		/* If it's a shared ref we know a cross reference exists */
		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
			ret = 1;
			break;
		}
2276

2277
		data_ref = btrfs_delayed_node_to_data_ref(ref);
2278

		/*
		 * If our ref doesn't match the one we're currently looking at
		 * then we have a cross reference.
		 */
		if (data_ref->root != root->root_key.objectid ||
		    data_ref->objectid != objectid ||
		    data_ref->offset != offset) {
			ret = 1;
			break;
		}
2289
	}
2290
	spin_unlock(&head->lock);
2291
	mutex_unlock(&head->mutex);
2292
	btrfs_put_transaction(cur_trans);
2293 2294 2295
	return ret;
}

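/*
 * Check the committed extent tree for other references to this data extent.
 * Returns 0 when the extent is exclusively owned by @root, 1 when it is (or
 * may be) shared, and -ENOENT when no extent item was found for @bytenr.
 */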
static noinline int check_committed_ref(struct btrfs_root *root,
2297
					struct btrfs_path *path,
2298 2299
					u64 objectid, u64 offset, u64 bytenr,
					bool strict)
2300
{
2301 2302
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
2303
	struct extent_buffer *leaf;
2304 2305 2306
	struct btrfs_extent_data_ref *ref;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_extent_item *ei;
2307
	struct btrfs_key key;
2308
	u32 item_size;
2309
	int type;
2310
	int ret;
2311

2312
	key.objectid = bytenr;
	key.offset = (u64)-1;
2314
	key.type = BTRFS_EXTENT_ITEM_KEY;
2315 2316 2317 2318

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
2319
	BUG_ON(ret == 0); /* Corruption */

	ret = -ENOENT;
	if (path->slots[0] == 0)
		goto out;
2324

	path->slots[0]--;
2326
	leaf = path->nodes[0];
2327
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2328

2329
	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2330
		goto out;
2331

2332 2333 2334
	ret = 1;
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2335

2336
	/* If extent item has more than 1 inline ref then it's shared */
2337 2338 2339
	if (item_size != sizeof(*ei) +
	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
		goto out;
2340

	/*
	 * If extent created before last snapshot => it's shared unless the
	 * snapshot has been deleted. Use the heuristic if strict is false.
	 */
	if (!strict &&
	    (btrfs_extent_generation(leaf, ei) <=
	     btrfs_root_last_snapshot(&root->root_item)))
2348 2349 2350
		goto out;

	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2351

2352
	/* If this extent has SHARED_DATA_REF then it's shared */
2353 2354
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
	if (type != BTRFS_EXTENT_DATA_REF_KEY)
		goto out;

	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
	if (btrfs_extent_refs(leaf, ei) !=
	    btrfs_extent_data_ref_count(leaf, ref) ||
	    btrfs_extent_data_ref_root(leaf, ref) !=
	    root->root_key.objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		goto out;

	ret = 0;
out:
	return ret;
}

2371
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
2372
			  u64 bytenr, bool strict)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
2379
		return -ENOMEM;
2380 2381

	do {
2382
		ret = check_committed_ref(root, path, objectid,
2383
					  offset, bytenr, strict);
2384
		if (ret && ret != -ENOENT)
2385
			goto out;

2387 2388
		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
	} while (ret == -EAGAIN);
2389

2390
out:
	btrfs_free_path(path);
2392 2393
	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
		WARN_ON(ret > 0);
2394
	return ret;
2395
}

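/*
 * Add or drop one reference for every extent referenced by the tree block
 * @buf: file extent items when @buf is a leaf, child block pointers when it
 * is a node.  @inc selects between adding and dropping the references.
 */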
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2398
			   struct btrfs_root *root,
2399
			   struct extent_buffer *buf,
2400
			   int full_backref, int inc)
{
2402
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
2404 2405
	u64 num_bytes;
	u64 parent;
	u64 ref_root;
	u32 nritems;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
2410 2411
	struct btrfs_ref generic_ref = { 0 };
	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
	int i;
2413
	int action;
	int level;
	int ret = 0;
2416

2417
	if (btrfs_is_testing(fs_info))
2418
		return 0;
2419

	ref_root = btrfs_header_owner(buf);
	nritems = btrfs_header_nritems(buf);
	level = btrfs_header_level(buf);

2424
	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
2425
		return 0;

2427 2428 2429 2430
	if (full_backref)
		parent = buf->start;
	else
		parent = 0;
2431 2432 2433 2434
	if (inc)
		action = BTRFS_ADD_DELAYED_REF;
	else
		action = BTRFS_DROP_DELAYED_REF;
2435 2436

	for (i = 0; i < nritems; i++) {
		if (level == 0) {
2438
			btrfs_item_key_to_cpu(buf, &key, i);
2439
			if (key.type != BTRFS_EXTENT_DATA_KEY)
				continue;
2441
			fi = btrfs_item_ptr(buf, i,
					    struct btrfs_file_extent_item);
			if (btrfs_file_extent_type(buf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE)
				continue;
			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
			if (bytenr == 0)
				continue;
2449 2450 2451

			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
			key.offset -= btrfs_file_extent_offset(buf, fi);
			btrfs_init_generic_ref(&generic_ref, action, bytenr,
					       num_bytes, parent);
			generic_ref.real_root = root->root_key.objectid;
			btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
					    key.offset);
			generic_ref.skip_qgroup = for_reloc;
2458
			if (inc)
2459
				ret = btrfs_inc_extent_ref(trans, &generic_ref);
2460
			else
2461
				ret = btrfs_free_extent(trans, &generic_ref);
			if (ret)
				goto fail;
		} else {
2465
			bytenr = btrfs_node_blockptr(buf, i);
2466
			num_bytes = fs_info->nodesize;
2467 2468 2469 2470 2471
			btrfs_init_generic_ref(&generic_ref, action, bytenr,
					       num_bytes, parent);
			generic_ref.real_root = root->root_key.objectid;
			btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
			generic_ref.skip_qgroup = for_reloc;
2472
			if (inc)
2473
				ret = btrfs_inc_extent_ref(trans, &generic_ref);
2474
			else
2475
				ret = btrfs_free_extent(trans, &generic_ref);
			if (ret)
				goto fail;
		}
	}
	return 0;
fail:
2482 2483 2484 2485
	return ret;
}

int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2486
		  struct extent_buffer *buf, int full_backref)
2487
{
2488
	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2489 2490 2491
}

int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2492
		  struct extent_buffer *buf, int full_backref)
2493
{
2494
	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}

2497
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
{
2499
	struct btrfs_fs_info *fs_info = root->fs_info;
2500
	u64 flags;
	u64 ret;

2503 2504
	if (data)
		flags = BTRFS_BLOCK_GROUP_DATA;
2505
	else if (root == fs_info->chunk_root)
2506
		flags = BTRFS_BLOCK_GROUP_SYSTEM;
	else
2508
		flags = BTRFS_BLOCK_GROUP_METADATA;

2510
	ret = btrfs_get_alloc_profile(fs_info, flags);
	return ret;
}

2514
static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
2515
{
2516
	struct btrfs_block_group *cache;
2517
	u64 bytenr;

2519 2520 2521
	spin_lock(&fs_info->block_group_cache_lock);
	bytenr = fs_info->first_logical_byte;
	spin_unlock(&fs_info->block_group_cache_lock);
2522 2523 2524 2525

	if (bytenr < (u64)-1)
		return bytenr;

2526
	cache = btrfs_lookup_first_block_group(fs_info, search_start);
	if (!cache)
2528
		return 0;

2530
	bytenr = cache->start;
2531
	btrfs_put_block_group(cache);
2532 2533

	return bytenr;
2534 2535
}

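/*
 * Account @num_bytes at @bytenr as pinned in the block group and space info
 * counters and mark the range dirty in the transaction's pinned extents
 * tree, so the space is not reused before the transaction commits.
 */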
static int pin_down_extent(struct btrfs_trans_handle *trans,
			   struct btrfs_block_group *cache,
2538
			   u64 bytenr, u64 num_bytes, int reserved)
2539
{
2540 2541
	struct btrfs_fs_info *fs_info = cache->fs_info;

2542 2543 2544
	spin_lock(&cache->space_info->lock);
	spin_lock(&cache->lock);
	cache->pinned += num_bytes;
2545 2546
	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
					     num_bytes);
	if (reserved) {
		cache->reserved -= num_bytes;
		cache->space_info->bytes_reserved -= num_bytes;
	}
	spin_unlock(&cache->lock);
	spin_unlock(&cache->space_info->lock);

2554
	__btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes);
2555
	set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
2556 2557 2558
			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
	return 0;
}

2560
int btrfs_pin_extent(struct btrfs_trans_handle *trans,
2561 2562
		     u64 bytenr, u64 num_bytes, int reserved)
{
2563
	struct btrfs_block_group *cache;

2565
	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2566
	BUG_ON(!cache); /* Logic error */
2567

2568
	pin_down_extent(trans, cache, bytenr, num_bytes, reserved);
2569 2570

	btrfs_put_block_group(cache);
2571 2572 2573
	return 0;
}

/*
 * this function must be called within a transaction
 */
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2578 2579
				    u64 bytenr, u64 num_bytes)
{
2580
	struct btrfs_block_group *cache;
2581
	int ret;
2582

2583
	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
2584 2585
	if (!cache)
		return -EINVAL;

	/*
	 * pull in the free space cache (if any) so that our pin
	 * removes the free space from the cache.  We have load_only set
	 * to one because the slow code to read in the free extents does check
	 * the pinned extents.
	 */
2593
	btrfs_cache_block_group(cache, 1);
	/*
	 * Make sure we wait until the cache is completely built in case it is
	 * missing or is invalid and therefore needs to be rebuilt.
	 */
	ret = btrfs_wait_block_group_cache_done(cache);
	if (ret)
		goto out;
2601

2602
	pin_down_extent(trans, cache, bytenr, num_bytes, 0);
2603 2604

	/* remove us from the free space cache (if we're there at all) */
2605
	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
2606
out:
2607
	btrfs_put_block_group(cache);
2608
	return ret;
2609 2610
}

2611 2612
static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
				   u64 start, u64 num_bytes)
2613 2614
{
	int ret;
2615
	struct btrfs_block_group *block_group;
2616

2617
	block_group = btrfs_lookup_block_group(fs_info, start);
2618 2619 2620
	if (!block_group)
		return -EINVAL;

	btrfs_cache_block_group(block_group, 1);
	/*
	 * Make sure we wait until the cache is completely built in case it is
	 * missing or is invalid and therefore needs to be rebuilt.
	 */
	ret = btrfs_wait_block_group_cache_done(block_group);
	if (ret)
		goto out;
2629

2630 2631
	ret = btrfs_remove_free_space(block_group, start, num_bytes);
out:
2632 2633 2634 2635
	btrfs_put_block_group(block_group);
	return ret;
}

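/*
 * During log replay on a filesystem with mixed block groups, remove the
 * data extents referenced by the replayed tree block @eb from the free
 * space cache so they cannot be handed out again.
 */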
int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2637
{
2638
	struct btrfs_fs_info *fs_info = eb->fs_info;
2639 2640 2641 2642
	struct btrfs_file_extent_item *item;
	struct btrfs_key key;
	int found_type;
	int i;
2643
	int ret = 0;
2644

2645
	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
		return 0;

	for (i = 0; i < btrfs_header_nritems(eb); i++) {
		btrfs_item_key_to_cpu(eb, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
		found_type = btrfs_file_extent_type(eb, item);
		if (found_type == BTRFS_FILE_EXTENT_INLINE)
			continue;
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			continue;
		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2660 2661 2662
		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
		if (ret)
			break;
2663 2664
	}

2665
	return ret;
2666 2667
}

2668
static void
2669
btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2670 2671 2672 2673
{
	atomic_inc(&bg->reservations);
}

2674 2675 2676 2677 2678
/*
 * Returns the free cluster for the given space info and sets empty_cluster to
 * what it should be based on the mount options.
 */
static struct btrfs_free_cluster *
2679 2680
fetch_cluster_info(struct btrfs_fs_info *fs_info,
		   struct btrfs_space_info *space_info, u64 *empty_cluster)
{
	struct btrfs_free_cluster *ret = NULL;

	*empty_cluster = 0;
	if (btrfs_mixed_space_info(space_info))
		return ret;

	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2689
		ret = &fs_info->meta_alloc_cluster;
2690 2691 2692
		if (btrfs_test_opt(fs_info, SSD))
			*empty_cluster = SZ_2M;
		else
2693
			*empty_cluster = SZ_64K;
2694 2695 2696
	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
		*empty_cluster = SZ_2M;
2697
		ret = &fs_info->data_alloc_cluster;
2698 2699 2700 2701 2702
	}

	return ret;
}

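/*
 * Walk [start, end] and undo the pinned accounting for every block group
 * the range touches, optionally returning the space to the free space
 * cache and to any waiting space reservation tickets.
 */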
static int unpin_extent_range(struct btrfs_fs_info *fs_info,
			      u64 start, u64 end,
2705
			      const bool return_free_space)
{
2707
	struct btrfs_block_group *cache = NULL;
2708 2709
	struct btrfs_space_info *space_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2710
	struct btrfs_free_cluster *cluster = NULL;
2711
	u64 len;
2712 2713
	u64 total_unpinned = 0;
	u64 empty_cluster = 0;
2714
	bool readonly;

2716
	while (start <= end) {
2717
		readonly = false;
2718
		if (!cache ||
2719
		    start >= cache->start + cache->length) {
2720 2721
			if (cache)
				btrfs_put_block_group(cache);
2722
			total_unpinned = 0;
2723
			cache = btrfs_lookup_block_group(fs_info, start);
2724
			BUG_ON(!cache); /* Logic error */
2725

2726
			cluster = fetch_cluster_info(fs_info,
2727 2728 2729
						     cache->space_info,
						     &empty_cluster);
			empty_cluster <<= 1;
2730 2731
		}

2732
		len = cache->start + cache->length - start;
2733 2734
		len = min(len, end + 1 - start);

2735
		down_read(&fs_info->commit_root_sem);
2736 2737 2738 2739
		if (start < cache->last_byte_to_unpin && return_free_space) {
			u64 add_len = min(len, cache->last_byte_to_unpin - start);

			btrfs_add_free_space(cache, start, add_len);
2740
		}
2741
		up_read(&fs_info->commit_root_sem);
2742

2743
		start += len;
2744
		total_unpinned += len;
2745
		space_info = cache->space_info;
2746

		/*
		 * If this space cluster has been marked as fragmented and we've
		 * unpinned enough in this block group to potentially allow a
		 * cluster to be created inside of it go ahead and clear the
		 * fragmented check.
		 */
		if (cluster && cluster->fragmented &&
		    total_unpinned > empty_cluster) {
			spin_lock(&cluster->lock);
			cluster->fragmented = 0;
			spin_unlock(&cluster->lock);
		}

2760
		spin_lock(&space_info->lock);
2761 2762
		spin_lock(&cache->lock);
		cache->pinned -= len;
2763
		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2764
		space_info->max_extent_size = 0;
2765
		__btrfs_mod_total_bytes_pinned(space_info, -len);
2766 2767 2768
		if (cache->ro) {
			space_info->bytes_readonly += len;
			readonly = true;
2769 2770 2771 2772
		} else if (btrfs_is_zoned(fs_info)) {
			/* Need reset before reusing in a zoned block group */
			space_info->bytes_zone_unusable += len;
			readonly = true;
2773
		}
2774
		spin_unlock(&cache->lock);
2775 2776 2777
		if (!readonly && return_free_space &&
		    global_rsv->space_info == space_info) {
			u64 to_add = len;
2778

2779 2780
			spin_lock(&global_rsv->lock);
			if (!global_rsv->full) {
2781 2782 2783
				to_add = min(len, global_rsv->size -
					     global_rsv->reserved);
				global_rsv->reserved += to_add;
2784 2785
				btrfs_space_info_update_bytes_may_use(fs_info,
						space_info, to_add);
2786 2787
				if (global_rsv->reserved >= global_rsv->size)
					global_rsv->full = 1;
2788
				len -= to_add;
2789 2790 2791
			}
			spin_unlock(&global_rsv->lock);
		}
2792 2793 2794
		/* Add to any tickets we may have */
		if (!readonly && return_free_space && len)
			btrfs_try_granting_tickets(fs_info, space_info);
2795
		spin_unlock(&space_info->lock);
	}
2797 2798 2799

	if (cache)
		btrfs_put_block_group(cache);
	return 0;
}

2803
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2804
{
2805
	struct btrfs_fs_info *fs_info = trans->fs_info;
2806
	struct btrfs_block_group *block_group, *tmp;
2807
	struct list_head *deleted_bgs;
2808
	struct extent_io_tree *unpin;
2809 2810
	u64 start;
	u64 end;
2811 2812
	int ret;

2813
	unpin = &trans->transaction->pinned_extents;
2814

2815
	while (!TRANS_ABORTED(trans)) {
2816 2817
		struct extent_state *cached_state = NULL;

2818
		mutex_lock(&fs_info->unused_bg_unpin_mutex);
2819
		ret = find_first_extent_bit(unpin, 0, &start, &end,
2820
					    EXTENT_DIRTY, &cached_state);
2821 2822
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2823
			break;
2824
		}
2825

2826
		if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2827
			ret = btrfs_discard_extent(fs_info, start,
2828
						   end + 1 - start, NULL);
2829

2830
		clear_extent_dirty(unpin, start, end, &cached_state);
2831
		unpin_extent_range(fs_info, start, end, true);
2832
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2833
		free_extent_state(cached_state);
2834
		cond_resched();
2835
	}

2837 2838
	if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
		btrfs_discard_calc_delay(&fs_info->discard_ctl);
2839
		btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
2840
	}
2841

	/*
	 * Transaction is finished.  We don't need the lock anymore.  We
	 * do need to clean up the block groups in case of a transaction
	 * abort.
	 */
	deleted_bgs = &trans->transaction->deleted_bgs;
	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
		u64 trimmed = 0;

		ret = -EROFS;
2852
		if (!TRANS_ABORTED(trans))
2853
			ret = btrfs_discard_extent(fs_info,
2854 2855
						   block_group->start,
						   block_group->length,
2856 2857 2858
						   &trimmed);

		list_del_init(&block_group->bg_list);
2859
		btrfs_unfreeze_block_group(block_group);
2860 2861 2862 2863 2864
		btrfs_put_block_group(block_group);

		if (ret) {
			const char *errstr = btrfs_decode_error(ret);
			btrfs_warn(fs_info,
2865
			   "discard failed while removing blockgroup: errno=%d %s",
2866 2867 2868 2869
				   ret, errstr);
		}
	}

	return 0;
}

/*
 * Drop one or more refs of @node.
 *
 * 1. Locate the extent refs.
 *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
 *    Locate it, then reduce the refs number or remove the ref line completely.
 *
 * 2. Update the refs count in EXTENT/METADATA_ITEM
 *
 * Inline backref case:
 *
 * in extent tree we have:
 *
 * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
 *		refs 2 gen 6 flags DATA
 *		extent data backref root FS_TREE objectid 258 offset 0 count 1
 *		extent data backref root FS_TREE objectid 257 offset 0 count 1
 *
 * This function gets called with:
 *
 *    node->bytenr = 13631488
 *    node->num_bytes = 1048576
 *    root_objectid = FS_TREE
 *    owner_objectid = 257
 *    owner_offset = 0
 *    refs_to_drop = 1
 *
 * Then we should get some like:
 *
 * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
 *		refs 1 gen 6 flags DATA
 *		extent data backref root FS_TREE objectid 258 offset 0 count 1
 *
 * Keyed backref case:
 *
 * in extent tree we have:
 *
 *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
 *		refs 754 gen 6 flags DATA
 *	[...]
 *	item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
 *		extent data backref root FS_TREE objectid 866 offset 0 count 1
 *
 * This function gets called with:
 *
 *    node->bytenr = 13631488
 *    node->num_bytes = 1048576
 *    root_objectid = FS_TREE
 *    owner_objectid = 866
 *    owner_offset = 0
 *    refs_to_drop = 1
 *
 * Then we should get some like:
 *
 *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
 *		refs 753 gen 6 flags DATA
 *
 * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
 */
2932
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2933 2934 2935 2936
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extent_op)
2937
{
2938
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_key key;
2940
	struct btrfs_path *path;
2941
	struct btrfs_root *extent_root = info->extent_root;
2942
	struct extent_buffer *leaf;
2943 2944
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
2945
	int ret;
2946
	int is_data;
2947 2948 2949
	int extent_slot = 0;
	int found_extent = 0;
	int num_to_del = 1;
2950 2951
	u32 item_size;
	u64 refs;
2952 2953
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	int last_ref = 0;
2955
	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);

2957
	path = btrfs_alloc_path();
2958 2959
	if (!path)
		return -ENOMEM;
2960

2961
	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;

	if (!is_data && refs_to_drop != 1) {
		btrfs_crit(info,
"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
			   node->bytenr, refs_to_drop);
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
2971

2972
	if (is_data)
2973
		skinny_metadata = false;
2974

2975 2976
	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
				    parent, root_objectid, owner_objectid,
2977
				    owner_offset);
2978
	if (ret == 0) {
		/*
		 * Either the inline backref or the SHARED_DATA_REF/
		 * SHARED_BLOCK_REF is found
		 *
		 * Here is a quick path to locate EXTENT/METADATA_ITEM.
		 * It's possible the EXTENT/METADATA_ITEM is near current slot.
		 */
2986
		extent_slot = path->slots[0];
2987 2988
		while (extent_slot >= 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
2989
					      extent_slot);
2990
			if (key.objectid != bytenr)
2991
				break;
2992 2993
			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes) {
2994 2995 2996
				found_extent = 1;
				break;
			}
2997 2998 2999 3000 3001
			if (key.type == BTRFS_METADATA_ITEM_KEY &&
			    key.offset == owner_objectid) {
				found_extent = 1;
				break;
			}
3002 3003

			/* Quick path didn't find the EXTENT/METADATA_ITEM */
			if (path->slots[0] - extent_slot > 5)
				break;
3006
			extent_slot--;
3007
		}
3008

		if (!found_extent) {
			if (iref) {
				btrfs_crit(info,
"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
				btrfs_abort_transaction(trans, -EUCLEAN);
				goto err_dump;
			}
			/* Must be SHARED_* item, remove the backref first */
3017
			ret = remove_extent_backref(trans, path, NULL,
3018
						    refs_to_drop,
						    is_data, &last_ref);
3020
			if (ret) {
3021
				btrfs_abort_transaction(trans, ret);
3022 3023
				goto out;
			}
3024
			btrfs_release_path(path);
3025

3026
			/* Slow path to locate EXTENT/METADATA_ITEM */
3027 3028 3029 3030
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;

3031 3032 3033 3034 3035
			if (!is_data && skinny_metadata) {
				key.type = BTRFS_METADATA_ITEM_KEY;
				key.offset = owner_objectid;
			}

			ret = btrfs_search_slot(trans, extent_root,
						&key, path, -1, 1);
			if (ret > 0 && skinny_metadata && path->slots[0]) {
				/*
				 * Couldn't find our skinny metadata item,
				 * see if we have ye olde extent item.
				 */
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == num_bytes)
					ret = 0;
			}

			if (ret > 0 && skinny_metadata) {
				skinny_metadata = false;
3054
				key.objectid = bytenr;
				key.type = BTRFS_EXTENT_ITEM_KEY;
				key.offset = num_bytes;
				btrfs_release_path(path);
				ret = btrfs_search_slot(trans, extent_root,
							&key, path, -1, 1);
			}

3062
			if (ret) {
				btrfs_err(info,
					  "umm, got %d back from search, was looking for %llu",
					  ret, bytenr);
3066
				if (ret > 0)
3067
					btrfs_print_leaf(path->nodes[0]);
3068
			}
3069
			if (ret < 0) {
3070
				btrfs_abort_transaction(trans, ret);
3071 3072
				goto out;
			}
			extent_slot = path->slots[0];
		}
3075
	} else if (WARN_ON(ret == -ENOENT)) {
3076
		btrfs_print_leaf(path->nodes[0]);
3077 3078
		btrfs_err(info,
			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
3079 3080
			bytenr, parent, root_objectid, owner_objectid,
			owner_offset);
3081
		btrfs_abort_transaction(trans, ret);
3082
		goto out;
3083
	} else {
3084
		btrfs_abort_transaction(trans, ret);
3085
		goto out;
3086
	}
3087 3088

	leaf = path->nodes[0];
3089
	item_size = btrfs_item_size_nr(leaf, extent_slot);
3090
	if (unlikely(item_size < sizeof(*ei))) {
3091 3092 3093 3094 3095
		ret = -EINVAL;
		btrfs_print_v0_err(info);
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
3096
	ei = btrfs_item_ptr(leaf, extent_slot,
			    struct btrfs_extent_item);
3098 3099
	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
	    key.type == BTRFS_EXTENT_ITEM_KEY) {
3100
		struct btrfs_tree_block_info *bi;
3101 3102
		if (item_size < sizeof(*ei) + sizeof(*bi)) {
			btrfs_crit(info,
3103
"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
				   key.objectid, key.type, key.offset,
				   owner_objectid, item_size,
				   sizeof(*ei) + sizeof(*bi));
			btrfs_abort_transaction(trans, -EUCLEAN);
			goto err_dump;
		}
3110 3111 3112
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
	}
3113

3114
	refs = btrfs_extent_refs(leaf, ei);
3115
	if (refs < refs_to_drop) {
3116 3117
		btrfs_crit(info,
		"trying to drop %d refs but we only have %llu for bytenr %llu",
			  refs_to_drop, refs, bytenr);
3119 3120
		btrfs_abort_transaction(trans, -EUCLEAN);
		goto err_dump;
3121
	}
3122
	refs -= refs_to_drop;
3123

	if (refs > 0) {
		if (extent_op)
			__run_delayed_extent_op(extent_op, leaf, ei);
		/*
		 * In the case of inline back ref, reference count will
		 * be updated by remove_extent_backref
3130
		 */
3131
		if (iref) {
			if (!found_extent) {
				btrfs_crit(info,
"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
				btrfs_abort_transaction(trans, -EUCLEAN);
				goto err_dump;
			}
3138 3139 3140 3141 3142
		} else {
			btrfs_set_extent_refs(leaf, ei, refs);
			btrfs_mark_buffer_dirty(leaf);
		}
		if (found_extent) {
3143 3144 3145
			ret = remove_extent_backref(trans, path, iref,
						    refs_to_drop, is_data,
						    &last_ref);
3146
			if (ret) {
3147
				btrfs_abort_transaction(trans, ret);
3148 3149
				goto out;
			}
3150
		}
3151
	} else {
3152
		/* In this branch refs == 1 */
3153
		if (found_extent) {
			if (is_data && refs_to_drop !=
			    extent_data_ref_count(path, iref)) {
				btrfs_crit(info,
		"invalid refs_to_drop, current refs %u refs_to_drop %u",
					   extent_data_ref_count(path, iref),
					   refs_to_drop);
				btrfs_abort_transaction(trans, -EUCLEAN);
				goto err_dump;
			}
3163
			if (iref) {
				if (path->slots[0] != extent_slot) {
					btrfs_crit(info,
"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
						   key.objectid, key.type,
						   key.offset);
					btrfs_abort_transaction(trans, -EUCLEAN);
					goto err_dump;
				}
3172
			} else {
				/*
				 * No inline ref, we must be at SHARED_* item,
				 * And it's single ref, it must be:
				 * |	extent_slot	  ||extent_slot + 1|
				 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
				 */
				if (path->slots[0] != extent_slot + 1) {
					btrfs_crit(info,
	"invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
					btrfs_abort_transaction(trans, -EUCLEAN);
					goto err_dump;
				}
3185 3186 3187
				path->slots[0] = extent_slot;
				num_to_del = 2;
			}
		}
3189

		last_ref = 1;
3191 3192
		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
				      num_to_del);
3193
		if (ret) {
3194
			btrfs_abort_transaction(trans, ret);
3195 3196
			goto out;
		}
3197
		btrfs_release_path(path);
3198

3199
		if (is_data) {
3200 3201
			ret = btrfs_del_csums(trans, info->csum_root, bytenr,
					      num_bytes);
3202
			if (ret) {
3203
				btrfs_abort_transaction(trans, ret);
3204 3205
				goto out;
			}
3206 3207
		}

3208
		ret = add_to_free_space_tree(trans, bytenr, num_bytes);
3209
		if (ret) {
3210
			btrfs_abort_transaction(trans, ret);
3211 3212 3213
			goto out;
		}

3214
		ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
3215
		if (ret) {
3216
			btrfs_abort_transaction(trans, ret);
3217 3218
			goto out;
		}
3219
	}
	btrfs_release_path(path);

3222
out:
3223
	btrfs_free_path(path);
3224
	return ret;
err_dump:
	/*
	 * Leaf dump can take up a lot of log buffer, so we only do full leaf
	 * dump for debug build.
	 */
	if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
		btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
			   path->slots[0], extent_slot);
		btrfs_print_leaf(path->nodes[0]);
	}

	btrfs_free_path(path);
	return -EUCLEAN;
3238 3239
}

/*
 * when we free a block, it is possible (and likely) that we free the last
 * delayed ref for that extent as well.  This searches the delayed ref tree for
 * a given extent, and if there are no other delayed refs to be processed, it
 * removes it from the tree.
 */
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3247
				      u64 bytenr)
3248 3249 3250
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
3251
	int ret = 0;
3252 3253 3254

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
3255
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3256
	if (!head)
3257
		goto out_delayed_unlock;
3258

3259
	spin_lock(&head->lock);
3260
	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3261 3262
		goto out;

	if (cleanup_extent_op(head) != NULL)
		goto out;
3265

	/*
	 * waiting for the lock here would deadlock.  If someone else has it
	 * locked they are already in the process of dropping it anyway
	 */
	if (!mutex_trylock(&head->mutex))
		goto out;

3273
	btrfs_delete_ref_head(delayed_refs, head);
3274
	head->processing = 0;
3275

3276
	spin_unlock(&head->lock);
3277 3278
	spin_unlock(&delayed_refs->lock);

3279 3280 3281 3282
	BUG_ON(head->extent_op);
	if (head->must_insert_reserved)
		ret = 1;

3283
	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3284
	mutex_unlock(&head->mutex);
3285
	btrfs_put_delayed_ref_head(head);
3286
	return ret;
3287
out:
3288
	spin_unlock(&head->lock);
3289 3290

out_delayed_unlock:
3291 3292 3293 3294
	spin_unlock(&delayed_refs->lock);
	return 0;
}

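/*
 * Free a tree block.  A delayed drop ref is queued for non-log trees; if
 * @last_ref is set and the block was allocated in the current transaction,
 * it is either pinned or handed straight back to the free space cache.
 */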
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct extent_buffer *buf,
3298
			   u64 parent, int last_ref)
3299
{
3300
	struct btrfs_fs_info *fs_info = root->fs_info;
3301
	struct btrfs_ref generic_ref = { 0 };
3302 3303
	int ret;

3304 3305 3306 3307 3308
	btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
			       buf->start, buf->len, parent);
	btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
			    root->root_key.objectid);

3309
	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
3310
		btrfs_ref_tree_mod(fs_info, &generic_ref);
3311
		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
3312
		BUG_ON(ret); /* -ENOMEM */
3313 3314
	}

3315
	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
3316
		struct btrfs_block_group *cache;
3317
		bool must_pin = false;
3318

3319
		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
3320
			ret = check_ref_cleanup(trans, buf->start);
3321 3322
			if (!ret) {
				btrfs_redirty_list_add(trans->transaction, buf);
3323
				goto out;
3324
			}
3325 3326
		}

3327
		cache = btrfs_lookup_block_group(fs_info, buf->start);
3328

3329
		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3330
			pin_down_extent(trans, cache, buf->start, buf->len, 1);
3331
			btrfs_put_block_group(cache);
3332
			goto out;
3333 3334
		}

		/*
		 * If this is a leaf and there are tree mod log users, we may
		 * have recorded mod log operations that point to this leaf.
		 * So we must make sure no one reuses this leaf's extent before
		 * mod log operations are applied to a node, otherwise after
		 * rewinding a node using the mod log operations we get an
		 * inconsistent btree, as the leaf's extent may now be used as
		 * a node or leaf for another different btree.
		 * We are safe from races here because at this point no other
		 * node or root points to this extent buffer, so if after this
		 * check a new tree mod log user joins, it will not be able to
		 * find a node pointing to this leaf and record operations that
		 * point to this leaf.
		 */
3349 3350 3351
		if (btrfs_header_level(buf) == 0 &&
		    test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
			must_pin = true;
3352 3353

		if (must_pin || btrfs_is_zoned(fs_info)) {
			btrfs_redirty_list_add(trans->transaction, buf);
			pin_down_extent(trans, cache, buf->start, buf->len, 1);
			btrfs_put_block_group(cache);
			goto out;
		}

3360 3361 3362
		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));

		btrfs_add_free_space(cache, buf->start, buf->len);
3363
		btrfs_free_reserved_bytes(cache, buf->len, 0);
3364
		btrfs_put_block_group(cache);
3365
		trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3366 3367
	}
out:
	if (last_ref) {
		/*
		 * Deleting the buffer, clear the corrupt flag since it doesn't
		 * matter anymore.
		 */
		clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
	}
3375 3376
}

3377
/* Can return -ENOMEM */
3378
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3379
{
3380
	struct btrfs_fs_info *fs_info = trans->fs_info;
3381 3382
	int ret;

3383
	if (btrfs_is_testing(fs_info))
3384
		return 0;
3385

3386 3387 3388 3389
	/*
	 * tree log blocks never actually go into the extent allocation
	 * tree, just update pinning info and exit early.
	 */
3390 3391 3392 3393
	if ((ref->type == BTRFS_REF_METADATA &&
	     ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
	    (ref->type == BTRFS_REF_DATA &&
	     ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
3394
		/* unlocks the pinned mutex */
3395
		btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
3396
		ret = 0;
3397
	} else if (ref->type == BTRFS_REF_METADATA) {
3398
		ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
3399
	} else {
3400
		ret = btrfs_add_delayed_data_ref(trans, ref, 0);
3401
	}
3402

3403 3404 3405 3406 3407
	if (!((ref->type == BTRFS_REF_METADATA &&
	       ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
	      (ref->type == BTRFS_REF_DATA &&
	       ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
		btrfs_ref_tree_mod(fs_info, ref);
3408

3409 3410 3411
	return ret;
}

enum btrfs_loop_type {
3413 3414 3415 3416
	LOOP_CACHING_NOWAIT,
	LOOP_CACHING_WAIT,
	LOOP_ALLOC_CHUNK,
	LOOP_NO_EMPTY_SIZE,
};

3419
static inline void
3420
btrfs_lock_block_group(struct btrfs_block_group *cache,
		       int delalloc)
{
	if (delalloc)
		down_read(&cache->data_rwsem);
}

3427
static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
		       int delalloc)
{
	btrfs_get_block_group(cache);
	if (delalloc)
		down_read(&cache->data_rwsem);
}

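/*
 * Return the block group currently backing @cluster (or NULL), with
 * cluster->refill_lock held on return.  For delalloc allocations the block
 * group's data_rwsem is also read-locked, retrying if the cluster switched
 * to a different block group in the meantime.
 */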
static struct btrfs_block_group *btrfs_lock_cluster(
		   struct btrfs_block_group *block_group,
3437 3438
		   struct btrfs_free_cluster *cluster,
		   int delalloc)
3439
	__acquires(&cluster->refill_lock)
3440
{
3441
	struct btrfs_block_group *used_bg = NULL;
3442

3443
	spin_lock(&cluster->refill_lock);
	while (1) {
		used_bg = cluster->block_group;
		if (!used_bg)
			return NULL;

		if (used_bg == block_group)
3450 3451
			return used_bg;

3452
		btrfs_get_block_group(used_bg);
3453

3454 3455
		if (!delalloc)
			return used_bg;
3456

3457 3458
		if (down_read_trylock(&used_bg->data_rwsem))
			return used_bg;
3459

3460
		spin_unlock(&cluster->refill_lock);
3461

3462 3463
		/* We should only have one-level nested. */
		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
3464

3465 3466 3467
		spin_lock(&cluster->refill_lock);
		if (used_bg == cluster->block_group)
			return used_bg;
3468

3469 3470 3471
		up_read(&used_bg->data_rwsem);
		btrfs_put_block_group(used_bg);
	}
3472 3473 3474
}

static inline void
btrfs_release_block_group(struct btrfs_block_group *cache,
			 int delalloc)
{
	if (delalloc)
		up_read(&cache->data_rwsem);
	btrfs_put_block_group(cache);
}

enum btrfs_extent_allocation_policy {
	BTRFS_EXTENT_ALLOC_CLUSTERED,
	BTRFS_EXTENT_ALLOC_ZONED,
};

/*
 * Structure used internally for find_free_extent() function.  Wraps needed
 * parameters.
 */
struct find_free_extent_ctl {
	/* Basic allocation info */
	u64 num_bytes;
	u64 empty_size;
	u64 flags;
	int delalloc;

	/* Where to start the search inside the bg */
	u64 search_start;

	/* For clustered allocation */
	u64 empty_cluster;
	struct btrfs_free_cluster *last_ptr;
	bool use_cluster;

	bool have_caching_bg;
	bool orig_have_caching_bg;

	/* Allocation is called for tree-log */
	bool for_treelog;

	/* RAID index, converted from flags */
	int index;

	/*
	 * Current loop number, check find_free_extent_update_loop() for details
	 */
	int loop;

	/*
	 * Whether we're refilling a cluster, if true we need to re-search
	 * current block group but don't try to refill the cluster again.
	 */
	bool retry_clustered;

	/*
	 * Whether we're updating free space cache, if true we need to re-search
	 * current block group but don't try updating free space cache again.
	 */
	bool retry_unclustered;

	/* If current block group is cached */
	int cached;

	/* Max contiguous hole found */
	u64 max_extent_size;

	/* Total free space from free space cache, not always contiguous */
	u64 total_free_space;

	/* Found result */
	u64 found_offset;

	/* Hint where to start looking for an empty space */
	u64 hint_byte;

	/* Allocation policy */
	enum btrfs_extent_allocation_policy policy;
};

/*
 * Helper function for find_free_extent().
 *
 * Return -ENOENT to inform the caller that we need to fall back to
 * unclustered mode.
 * Return -EAGAIN to inform the caller that we need to re-search this block
 * group.
 * Return >0 to inform the caller that we found nothing.
 * Return 0 means we have found a location and set ffe_ctl->found_offset.
 */
static int find_free_extent_clustered(struct btrfs_block_group *bg,
				      struct find_free_extent_ctl *ffe_ctl,
				      struct btrfs_block_group **cluster_bg_ret)
{
	struct btrfs_block_group *cluster_bg;
	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
	u64 aligned_cluster;
	u64 offset;
	int ret;

	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
	if (!cluster_bg)
		goto refill_cluster;
	if (cluster_bg != bg && (cluster_bg->ro ||
	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
		goto release_cluster;

	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
			ffe_ctl->num_bytes, cluster_bg->start,
			&ffe_ctl->max_extent_size);
	if (offset) {
		/* We have a block, we're done */
		spin_unlock(&last_ptr->refill_lock);
		trace_btrfs_reserve_extent_cluster(cluster_bg,
				ffe_ctl->search_start, ffe_ctl->num_bytes);
		*cluster_bg_ret = cluster_bg;
		ffe_ctl->found_offset = offset;
		return 0;
	}
	WARN_ON(last_ptr->block_group != cluster_bg);

release_cluster:
	/*
	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
	 * let's just skip it and let the allocator find whatever block it can
	 * find.  If we reach this point, we will have tried the cluster
	 * allocator plenty of times and not found anything, so we are likely
	 * way too fragmented for the clustering stuff to find anything.
	 *
	 * However, if the cluster is taken from the current block group,
	 * release the cluster first, so that we stand a better chance of
	 * succeeding in the unclustered allocation.
	 */
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
		spin_unlock(&last_ptr->refill_lock);
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
		return -ENOENT;
	}

	/* This cluster didn't work out, free it and start over */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);

	if (cluster_bg != bg)
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);

refill_cluster:
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
		spin_unlock(&last_ptr->refill_lock);
		return -ENOENT;
	}

	aligned_cluster = max_t(u64,
			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
			bg->full_stripe_len);
	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
			ffe_ctl->num_bytes, aligned_cluster);
	if (ret == 0) {
		/* Now pull our allocation out of this cluster */
		offset = btrfs_alloc_from_cluster(bg, last_ptr,
				ffe_ctl->num_bytes, ffe_ctl->search_start,
				&ffe_ctl->max_extent_size);
		if (offset) {
			/* We found one, proceed */
			spin_unlock(&last_ptr->refill_lock);
			trace_btrfs_reserve_extent_cluster(bg,
					ffe_ctl->search_start,
					ffe_ctl->num_bytes);
			ffe_ctl->found_offset = offset;
			return 0;
		}
	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
		   !ffe_ctl->retry_clustered) {
		spin_unlock(&last_ptr->refill_lock);

		ffe_ctl->retry_clustered = true;
		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
				ffe_ctl->empty_cluster + ffe_ctl->empty_size);
		return -EAGAIN;
	}
	/*
	 * At this point we either didn't find a cluster or we weren't able to
	 * allocate a block from our cluster.  Free the cluster we've been
	 * trying to use, and go to the next block group.
	 */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);
	spin_unlock(&last_ptr->refill_lock);
	return 1;
}

/*
 * Return >0 to inform the caller that we found nothing.
 * Return 0 when we found a free extent and set ffe_ctl->found_offset.
 * Return -EAGAIN to inform the caller that we need to re-search this block
 * group.
 */
static int find_free_extent_unclustered(struct btrfs_block_group *bg,
					struct find_free_extent_ctl *ffe_ctl)
{
	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
	u64 offset;

	/*
	 * We are doing an unclustered allocation, so set the fragmented flag
	 * so we don't bother trying to set up a cluster again until we get
	 * more space.
	 */
	if (unlikely(last_ptr)) {
		spin_lock(&last_ptr->lock);
		last_ptr->fragmented = 1;
		spin_unlock(&last_ptr->lock);
	}
	if (ffe_ctl->cached) {
		struct btrfs_free_space_ctl *free_space_ctl;

		free_space_ctl = bg->free_space_ctl;
		spin_lock(&free_space_ctl->tree_lock);
		if (free_space_ctl->free_space <
		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
		    ffe_ctl->empty_size) {
			ffe_ctl->total_free_space = max_t(u64,
					ffe_ctl->total_free_space,
					free_space_ctl->free_space);
			spin_unlock(&free_space_ctl->tree_lock);
			return 1;
		}
		spin_unlock(&free_space_ctl->tree_lock);
	}

	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
			ffe_ctl->num_bytes, ffe_ctl->empty_size,
			&ffe_ctl->max_extent_size);

	/*
	 * If we didn't find a chunk, and we haven't failed on this block group
	 * before, and this block group is in the middle of caching and we are
	 * ok with waiting, then go ahead and wait for progress to be made, and
	 * set @retry_unclustered to true.
	 *
	 * If @retry_unclustered is true then we've already waited on this
	 * block group once and should move on to the next block group.
	 */
	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
	    ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
						      ffe_ctl->empty_size);
		ffe_ctl->retry_unclustered = true;
		return -EAGAIN;
	} else if (!offset) {
		return 1;
	}
	ffe_ctl->found_offset = offset;
	return 0;
}

static int do_allocation_clustered(struct btrfs_block_group *block_group,
				   struct find_free_extent_ctl *ffe_ctl,
				   struct btrfs_block_group **bg_ret)
{
	int ret;

	/* We want to try and use the cluster allocator, so lets look there */
	if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
		ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
		if (ret >= 0 || ret == -EAGAIN)
			return ret;
		/* ret == -ENOENT case falls through */
	}

	return find_free_extent_unclustered(block_group, ffe_ctl);
}

/*
 * Tree-log block group locking
 * ============================
 *
 * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
 * indicates the starting address of a block group, which is reserved only
 * for tree-log metadata.
 *
 * Lock nesting
 * ============
 *
 * space_info::lock
 *   block_group::lock
 *     fs_info::treelog_bg_lock
 */

/*
 * Simple allocator for sequential-only block group. It only allows sequential
 * allocation. No need to play with trees. This function also reserves the
 * bytes as in btrfs_add_reserved_bytes.
 */
static int do_allocation_zoned(struct btrfs_block_group *block_group,
			       struct find_free_extent_ctl *ffe_ctl,
			       struct btrfs_block_group **bg_ret)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_space_info *space_info = block_group->space_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	u64 start = block_group->start;
	u64 num_bytes = ffe_ctl->num_bytes;
	u64 avail;
	u64 bytenr = block_group->start;
	u64 log_bytenr;
	int ret = 0;
	bool skip;

	ASSERT(btrfs_is_zoned(block_group->fs_info));

	/*
	 * Do not allow non-tree-log blocks in the dedicated tree-log block
	 * group, and vice versa.
	 */
	spin_lock(&fs_info->treelog_bg_lock);
	log_bytenr = fs_info->treelog_bg;
	skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
			      (!ffe_ctl->for_treelog && bytenr == log_bytenr));
	spin_unlock(&fs_info->treelog_bg_lock);
	if (skip)
		return 1;

	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	spin_lock(&fs_info->treelog_bg_lock);

	ASSERT(!ffe_ctl->for_treelog ||
	       block_group->start == fs_info->treelog_bg ||
	       fs_info->treelog_bg == 0);

	if (block_group->ro) {
		ret = 1;
		goto out;
	}

	/*
	 * Do not allow currently using block group to be tree-log dedicated
	 * block group.
	 */
	if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
	    (block_group->used || block_group->reserved)) {
		ret = 1;
		goto out;
	}

	avail = block_group->length - block_group->alloc_offset;
	if (avail < num_bytes) {
		if (ffe_ctl->max_extent_size < avail) {
			/*
			 * With sequential allocator, free space is always
			 * contiguous
			 */
			ffe_ctl->max_extent_size = avail;
			ffe_ctl->total_free_space = avail;
		}
		ret = 1;
		goto out;
	}

	if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
		fs_info->treelog_bg = block_group->start;

	ffe_ctl->found_offset = start + block_group->alloc_offset;
	block_group->alloc_offset += num_bytes;
	spin_lock(&ctl->tree_lock);
	ctl->free_space -= num_bytes;
	spin_unlock(&ctl->tree_lock);

	/*
	 * We do not check if found_offset is aligned to stripesize. The
	 * address is anyway rewritten when using zone append writing.
	 */

	ffe_ctl->search_start = ffe_ctl->found_offset;

out:
	if (ret && ffe_ctl->for_treelog)
		fs_info->treelog_bg = 0;
	spin_unlock(&fs_info->treelog_bg_lock);
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);
	return ret;
}

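/* Dispatch to the allocator implementation selected by ffe_ctl->policy. */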
static int do_allocation(struct btrfs_block_group *block_group,
			 struct find_free_extent_ctl *ffe_ctl,
			 struct btrfs_block_group **bg_ret)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
	case BTRFS_EXTENT_ALLOC_ZONED:
		return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
	default:
		BUG();
	}
}

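/*
 * Reset the per-block-group retry state and drop our reference on
 * @block_group (releasing its data_rwsem for delalloc allocations).
 */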
static void release_block_group(struct btrfs_block_group *block_group,
				struct find_free_extent_ctl *ffe_ctl,
				int delalloc)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		ffe_ctl->retry_clustered = false;
		ffe_ctl->retry_unclustered = false;
		break;
	case BTRFS_EXTENT_ALLOC_ZONED:
		/* Nothing to do */
		break;
	default:
		BUG();
	}

	BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
	       ffe_ctl->index);
	btrfs_release_block_group(block_group, delalloc);
}

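/*
 * We found an extent without using the cluster: remember where it landed so
 * the next clustered search can start from there (last_ptr->window_start).
 */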
static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
				   struct btrfs_key *ins)
{
	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;

	if (!ffe_ctl->use_cluster && last_ptr) {
		spin_lock(&last_ptr->lock);
		last_ptr->window_start = ins->objectid;
		spin_unlock(&last_ptr->lock);
	}
}

static void found_extent(struct find_free_extent_ctl *ffe_ctl,
			 struct btrfs_key *ins)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		found_extent_clustered(ffe_ctl, ins);
		break;
	case BTRFS_EXTENT_ALLOC_ZONED:
		/* Nothing to do */
		break;
	default:
		BUG();
	}
}

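/*
 * Decide how to continue after a forced chunk allocation failed with
 * ENOSPC, depending on the allocation policy.
 */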
static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		/*
		 * If we can't allocate a new chunk we've already looped through
		 * at least once, move on to the NO_EMPTY_SIZE case.
		 */
		ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
		return 0;
	case BTRFS_EXTENT_ALLOC_ZONED:
		/* Give up here */
		return -ENOSPC;
	default:
		BUG();
	}
}

/*
 * Return >0 means caller needs to re-search for free extent
 * Return 0 means we have the needed free extent.
 * Return <0 means we failed to locate any free extent.
 */
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
					struct btrfs_key *ins,
					struct find_free_extent_ctl *ffe_ctl,
					bool full_search)
{
	struct btrfs_root *root = fs_info->extent_root;
	int ret;

	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
		ffe_ctl->orig_have_caching_bg = true;

	if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
	    ffe_ctl->have_caching_bg)
		return 1;

	if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
		return 1;

	if (ins->objectid) {
		found_extent(ffe_ctl, ins);
		return 0;
	}

	/*
	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
	 *			caching kthreads as we move along
	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
	 *		       again
	 */
	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
		ffe_ctl->index = 0;
		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
			/*
			 * We want to skip the LOOP_CACHING_WAIT step if we
			 * don't have any uncached bgs and we've already done a
			 * full search through.
			 */
			if (ffe_ctl->orig_have_caching_bg || !full_search)
				ffe_ctl->loop = LOOP_CACHING_WAIT;
			else
				ffe_ctl->loop = LOOP_ALLOC_CHUNK;
		} else {
			ffe_ctl->loop++;
		}

		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
			struct btrfs_trans_handle *trans;
			int exist = 0;

			trans = current->journal_info;
			if (trans)
				exist = 1;
			else
				trans = btrfs_join_transaction(root);

			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				return ret;
			}

			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
						CHUNK_ALLOC_FORCE);

			/* Do not bail out on ENOSPC since we can do more. */
			if (ret == -ENOSPC)
				ret = chunk_allocation_failed(ffe_ctl);
			else if (ret < 0)
				btrfs_abort_transaction(trans, ret);
			else
				ret = 0;
			if (!exist)
				btrfs_end_transaction(trans);
			if (ret)
				return ret;
		}

		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
			if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
				return -ENOSPC;

			/*
			 * Don't loop again if we already have no empty_size and
			 * no empty_cluster.
			 */
			if (ffe_ctl->empty_size == 0 &&
			    ffe_ctl->empty_cluster == 0)
				return -ENOSPC;
			ffe_ctl->empty_size = 0;
			ffe_ctl->empty_cluster = 0;
		}
		return 1;
	}
	return -ENOSPC;
}

static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
					struct find_free_extent_ctl *ffe_ctl,
					struct btrfs_space_info *space_info,
					struct btrfs_key *ins)
{
	/*
	 * If our free space is heavily fragmented we may not be able to make
	 * big contiguous allocations, so instead of doing the expensive search
	 * for free space, simply return ENOSPC with our max_extent_size so we
	 * can go ahead and search for a more manageable chunk.
	 *
	 * If our max_extent_size is large enough for our allocation simply
	 * disable clustering since we will likely not be able to find enough
	 * space to create a cluster and induce latency trying.
	 */
	if (space_info->max_extent_size) {
		spin_lock(&space_info->lock);
		if (space_info->max_extent_size &&
		    ffe_ctl->num_bytes > space_info->max_extent_size) {
			ins->offset = space_info->max_extent_size;
			spin_unlock(&space_info->lock);
			return -ENOSPC;
		} else if (space_info->max_extent_size) {
			ffe_ctl->use_cluster = false;
		}
		spin_unlock(&space_info->lock);
	}

	ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
					       &ffe_ctl->empty_cluster);
	if (ffe_ctl->last_ptr) {
		struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;

		spin_lock(&last_ptr->lock);
		if (last_ptr->block_group)
			ffe_ctl->hint_byte = last_ptr->window_start;
		if (last_ptr->fragmented) {
			/*
			 * We still set window_start so we can keep track of the
			 * last place we found an allocation to try and save
			 * some time.
			 */
			ffe_ctl->hint_byte = last_ptr->window_start;
			ffe_ctl->use_cluster = false;
		}
		spin_unlock(&last_ptr->lock);
	}

	return 0;
}

static int prepare_allocation(struct btrfs_fs_info *fs_info,
			      struct find_free_extent_ctl *ffe_ctl,
			      struct btrfs_space_info *space_info,
			      struct btrfs_key *ins)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		return prepare_allocation_clustered(fs_info, ffe_ctl,
						    space_info, ins);
	case BTRFS_EXTENT_ALLOC_ZONED:
		if (ffe_ctl->for_treelog) {
			spin_lock(&fs_info->treelog_bg_lock);
			if (fs_info->treelog_bg)
				ffe_ctl->hint_byte = fs_info->treelog_bg;
			spin_unlock(&fs_info->treelog_bg_lock);
		}
		return 0;
	default:
		BUG();
	}
}

/*
 * walks the btree of allocated extents and finds a hole of a given size.
 * The key ins is changed to record the hole:
 * ins->objectid == start position
 * ins->flags = BTRFS_EXTENT_ITEM_KEY
 * ins->offset == the size of the hole.
 * Any available blocks before search_start are skipped.
 *
 * If there is no suitable free space, we will record the max size of
 * the free space extent currently available.
 *
 * The overall logic and call chain:
 *
 * find_free_extent()
 * |- Iterate through all block groups
 * |  |- Get a valid block group
 * |  |- Try to do clustered allocation in that block group
 * |  |- Try to do unclustered allocation in that block group
 * |  |- Check if the result is valid
 * |  |  |- If valid, then exit
 * |  |- Jump to next block group
 * |
 * |- Push harder to find free extents
 *    |- If not found, re-iterate all block groups
 */
static noinline int find_free_extent(struct btrfs_root *root,
4135
				u64 ram_bytes, u64 num_bytes, u64 empty_size,
4136
				u64 hint_byte_orig, struct btrfs_key *ins,
4137
				u64 flags, int delalloc)
4138
{
4139
	struct btrfs_fs_info *fs_info = root->fs_info;
4140
	int ret = 0;
4141
	int cache_block_group_error = 0;
4142
	struct btrfs_block_group *block_group = NULL;
4143
	struct find_free_extent_ctl ffe_ctl = {0};
4144
	struct btrfs_space_info *space_info;
4145
	bool full_search = false;
4146
	bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
4147

4148
	WARN_ON(num_bytes < fs_info->sectorsize);
4149 4150 4151 4152 4153 4154 4155 4156 4157 4158

	ffe_ctl.num_bytes = num_bytes;
	ffe_ctl.empty_size = empty_size;
	ffe_ctl.flags = flags;
	ffe_ctl.search_start = 0;
	ffe_ctl.delalloc = delalloc;
	ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
	ffe_ctl.have_caching_bg = false;
	ffe_ctl.orig_have_caching_bg = false;
	ffe_ctl.found_offset = 0;
4159
	ffe_ctl.hint_byte = hint_byte_orig;
4160
	ffe_ctl.for_treelog = for_treelog;
4161
	ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;
4162

4163 4164 4165 4166 4167 4168
	/* For clustered allocation */
	ffe_ctl.retry_clustered = false;
	ffe_ctl.retry_unclustered = false;
	ffe_ctl.last_ptr = NULL;
	ffe_ctl.use_cluster = true;

4169 4170 4171
	if (btrfs_is_zoned(fs_info))
		ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED;

4172
	ins->type = BTRFS_EXTENT_ITEM_KEY;
4173 4174
	ins->objectid = 0;
	ins->offset = 0;
4175

4176
	trace_find_free_extent(root, num_bytes, empty_size, flags);
J
Josef Bacik 已提交
4177

4178
	space_info = btrfs_find_space_info(fs_info, flags);
4179
	if (!space_info) {
4180
		btrfs_err(fs_info, "No space info for %llu", flags);
4181 4182
		return -ENOSPC;
	}
J
Josef Bacik 已提交
4183

4184 4185 4186
	ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
	if (ret < 0)
		return ret;
4187

4188 4189
	ffe_ctl.search_start = max(ffe_ctl.search_start,
				   first_logical_byte(fs_info, 0));
4190 4191
	ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte);
	if (ffe_ctl.search_start == ffe_ctl.hint_byte) {
4192 4193
		block_group = btrfs_lookup_block_group(fs_info,
						       ffe_ctl.search_start);
J
Josef Bacik 已提交
4194 4195 4196
		/*
		 * we don't want to use the block group if it doesn't match our
		 * allocation bits, or if its not cached.
4197 4198 4199
		 *
		 * However if we are re-searching with an ideal block group
		 * picked out then we don't care that the block group is cached.
J
Josef Bacik 已提交
4200
		 */
4201
		if (block_group && block_group_bits(block_group, flags) &&
4202
		    block_group->cached != BTRFS_CACHE_NO) {
J
Josef Bacik 已提交
4203
			down_read(&space_info->groups_sem);
4204 4205 4206 4207 4208 4209 4210 4211 4212 4213
			if (list_empty(&block_group->list) ||
			    block_group->ro) {
				/*
				 * someone is removing this block group,
				 * we can't jump into the have_block_group
				 * target because our list pointers are not
				 * valid
				 */
				btrfs_put_block_group(block_group);
				up_read(&space_info->groups_sem);
4214
			} else {
4215
				ffe_ctl.index = btrfs_bg_flags_to_raid_index(
4216
						block_group->flags);
4217
				btrfs_lock_block_group(block_group, delalloc);
4218
				goto have_block_group;
4219
			}
J
Josef Bacik 已提交
4220
		} else if (block_group) {
4221
			btrfs_put_block_group(block_group);
J
Josef Bacik 已提交
4222
		}
4223
	}
J
Josef Bacik 已提交
4224
search:
4225 4226 4227
	ffe_ctl.have_caching_bg = false;
	if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
	    ffe_ctl.index == 0)
4228
		full_search = true;
4229
	down_read(&space_info->groups_sem);
4230 4231
	list_for_each_entry(block_group,
			    &space_info->block_groups[ffe_ctl.index], list) {
4232 4233
		struct btrfs_block_group *bg_ret;

4234
		/* If the block group is read-only, we can skip it entirely. */
4235 4236 4237
		if (unlikely(block_group->ro)) {
			if (for_treelog)
				btrfs_clear_treelog_bg(block_group);
4238
			continue;
4239
		}
4240

4241
		btrfs_grab_block_group(block_group, delalloc);
4242
		ffe_ctl.search_start = block_group->start;
4243

4244 4245 4246 4247 4248
		/*
		 * this can happen if we end up cycling through all the
		 * raid types, but we want to make sure we only allocate
		 * for the proper type.
		 */
4249
		if (!block_group_bits(block_group, flags)) {
4250
			u64 extra = BTRFS_BLOCK_GROUP_DUP |
4251
				BTRFS_BLOCK_GROUP_RAID1_MASK |
4252
				BTRFS_BLOCK_GROUP_RAID56_MASK |
4253 4254 4255 4256 4257 4258 4259
				BTRFS_BLOCK_GROUP_RAID10;

			/*
			 * if they asked for extra copies and this block group
			 * doesn't provide them, bail.  This does allow us to
			 * fill raid0 from raid1.
			 */
4260
			if ((flags & extra) && !(block_group->flags & extra))
4261
				goto loop;
4262 4263 4264 4265 4266 4267 4268 4269

			/*
			 * This block group has different flags than we want.
			 * It's possible that we have MIXED_GROUP flag but no
			 * block group is mixed.  Just skip such block group.
			 */
			btrfs_release_block_group(block_group, delalloc);
			continue;
4270 4271
		}

J
Josef Bacik 已提交
4272
have_block_group:
4273
		ffe_ctl.cached = btrfs_block_group_done(block_group);
4274 4275
		if (unlikely(!ffe_ctl.cached)) {
			ffe_ctl.have_caching_bg = true;
4276
			ret = btrfs_cache_block_group(block_group, 0);
4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290

			/*
			 * If we get ENOMEM here or something else we want to
			 * try other block groups, because it may not be fatal.
			 * However if we can't find anything else we need to
			 * save our return here so that we return the actual
			 * error that caused problems, not ENOSPC.
			 */
			if (ret < 0) {
				if (!cache_block_group_error)
					cache_block_group_error = ret;
				ret = 0;
				goto loop;
			}
4291
			ret = 0;
J
Josef Bacik 已提交
4292 4293
		}

4294 4295
		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
			goto loop;
J
Josef Bacik 已提交
4296

4297 4298 4299 4300 4301 4302
		bg_ret = NULL;
		ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
		if (ret == 0) {
			if (bg_ret && bg_ret != block_group) {
				btrfs_release_block_group(block_group, delalloc);
				block_group = bg_ret;
4303
			}
4304
		} else if (ret == -EAGAIN) {
J
Josef Bacik 已提交
4305
			goto have_block_group;
4306
		} else if (ret > 0) {
4307
			goto loop;
4308 4309 4310
		}

		/* Checks */
4311 4312
		ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
					     fs_info->stripesize);
4313

J
Josef Bacik 已提交
4314
		/* move on to the next group */
4315
		if (ffe_ctl.search_start + num_bytes >
4316
		    block_group->start + block_group->length) {
4317 4318
			btrfs_add_free_space_unused(block_group,
					    ffe_ctl.found_offset, num_bytes);
J
Josef Bacik 已提交
4319
			goto loop;
4320
		}
4321

4322
		if (ffe_ctl.found_offset < ffe_ctl.search_start)
4323 4324 4325
			btrfs_add_free_space_unused(block_group,
					ffe_ctl.found_offset,
					ffe_ctl.search_start - ffe_ctl.found_offset);
J
Josef Bacik 已提交
4326

4327 4328
		ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
				num_bytes, delalloc);
4329
		if (ret == -EAGAIN) {
4330 4331
			btrfs_add_free_space_unused(block_group,
					ffe_ctl.found_offset, num_bytes);
J
Josef Bacik 已提交
4332
			goto loop;
J
Josef Bacik 已提交
4333
		}
4334
		btrfs_inc_block_group_reservations(block_group);
4335

4336
		/* we are all good, lets return */
4337
		ins->objectid = ffe_ctl.search_start;
J
Josef Bacik 已提交
4338
		ins->offset = num_bytes;
4339

4340 4341
		trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
					   num_bytes);
4342
		btrfs_release_block_group(block_group, delalloc);
J
Josef Bacik 已提交
4343 4344
		break;
loop:
4345
		release_block_group(block_group, &ffe_ctl, delalloc);
4346
		cond_resched();
J
Josef Bacik 已提交
4347 4348 4349
	}
	up_read(&space_info->groups_sem);

4350
	ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
4351
	if (ret > 0)
4352 4353
		goto search;

4354
	if (ret == -ENOSPC && !cache_block_group_error) {
4355 4356 4357 4358 4359 4360
		/*
		 * Use ffe_ctl->total_free_space as fallback if we can't find
		 * any contiguous hole.
		 */
		if (!ffe_ctl.max_extent_size)
			ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
4361
		spin_lock(&space_info->lock);
4362
		space_info->max_extent_size = ffe_ctl.max_extent_size;
4363
		spin_unlock(&space_info->lock);
4364
		ins->offset = ffe_ctl.max_extent_size;
4365 4366
	} else if (ret == -ENOSPC) {
		ret = cache_block_group_error;
4367
	}
C
Chris Mason 已提交
4368
	return ret;
4369
}
4370

/*
 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
 *			  hole that is at least as big as @num_bytes.
 *
 * @root           -	The root that will contain this extent
 *
 * @ram_bytes      -	The amount of space in ram that @num_bytes take. This
 *			is used for accounting purposes. This value differs
 *			from @num_bytes only in the case of compressed extents.
 *
 * @num_bytes      -	Number of bytes to allocate on-disk.
 *
 * @min_alloc_size -	Indicates the minimum amount of space that the
 *			allocator should try to satisfy. In some cases
 *			@num_bytes may be larger than what is required and if
 *			the filesystem is fragmented then allocation fails.
 *			However, the presence of @min_alloc_size gives a
 *			chance to try and satisfy the smaller allocation.
 *
 * @empty_size     -	A hint that you plan on doing more COW. This is the
 *			size in bytes the allocator should try to find free
 *			next to the block it returns.  This is just a hint and
 *			may be ignored by the allocator.
 *
 * @hint_byte      -	Hint to the allocator to start searching above the byte
 *			address passed. It might be ignored.
 *
 * @ins            -	This key is modified to record the found hole. It will
 *			have the following values:
 *			ins->objectid == start position
 *			ins->flags = BTRFS_EXTENT_ITEM_KEY
 *			ins->offset == the size of the hole.
 *
 * @is_data        -	Boolean flag indicating whether an extent is
 *			allocated for data (true) or metadata (false)
 *
 * @delalloc       -	Boolean flag indicating whether this allocation is for
 *			delalloc or not. If 'true' data_rwsem of block groups
 *			is going to be acquired.
 *
 *
 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
 * case -ENOSPC is returned then @ins->offset will contain the size of the
 * largest available hole the allocator managed to find.
 */
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
			 u64 num_bytes, u64 min_alloc_size,
			 u64 empty_size, u64 hint_byte,
			 struct btrfs_key *ins, int is_data, int delalloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	bool final_tried = num_bytes == min_alloc_size;
	u64 flags;
	int ret;
	bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);

	flags = get_alloc_profile_by_root(root, is_data);
again:
	WARN_ON(num_bytes < fs_info->sectorsize);
	ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
			       hint_byte, ins, flags, delalloc);
	if (!ret && !is_data) {
		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
	} else if (ret == -ENOSPC) {
		if (!final_tried && ins->offset) {
			num_bytes = min(num_bytes >> 1, ins->offset);
			num_bytes = round_down(num_bytes,
					       fs_info->sectorsize);
			num_bytes = max(num_bytes, min_alloc_size);
			ram_bytes = num_bytes;
			if (num_bytes == min_alloc_size)
				final_tried = true;
			goto again;
		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
			struct btrfs_space_info *sinfo;

			sinfo = btrfs_find_space_info(fs_info, flags);
			btrfs_err(fs_info,
			"allocation failed flags %llu, wanted %llu tree-log %d",
				  flags, num_bytes, for_treelog);
			if (sinfo)
				btrfs_dump_space_info(fs_info, sinfo,
						      num_bytes, 1);
		}
	}

	return ret;
}

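/*
 * Return a reserved but unused extent to the free space accounting of the
 * block group it belongs to.
 */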
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
			       u64 start, u64 len, int delalloc)
{
	struct btrfs_block_group *cache;

	cache = btrfs_lookup_block_group(fs_info, start);
	if (!cache) {
		btrfs_err(fs_info, "Unable to find block group for %llu",
			  start);
		return -ENOSPC;
	}

	btrfs_add_free_space(cache, start, len);
	btrfs_free_reserved_bytes(cache, len, delalloc);
	trace_btrfs_reserved_extent_free(fs_info, start, len);

	btrfs_put_block_group(cache);
	return 0;
}

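/* Look up the block group covering @start and pin @len bytes within it. */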
int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
			      u64 len)
{
	struct btrfs_block_group *cache;
	int ret = 0;

	cache = btrfs_lookup_block_group(trans->fs_info, start);
	if (!cache) {
		btrfs_err(trans->fs_info, "unable to find block group for %llu",
			  start);
		return -ENOSPC;
	}

	ret = pin_down_extent(trans, cache, start, len, 1);
	btrfs_put_block_group(cache);
	return ret;
}

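/*
 * Insert the extent item (with its inline backref) for a newly allocated
 * data extent into the extent tree, then update the free space tree and the
 * block group counters.
 */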
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int type;
	u32 size;

	if (parent > 0)
		type = BTRFS_SHARED_DATA_REF_KEY;
	else
		type = BTRFS_EXTENT_DATA_REF_KEY;

	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      ins, size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_DATA);

	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (parent > 0) {
		struct btrfs_shared_data_ref *ref;
		ref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
	} else {
		struct btrfs_extent_data_ref *ref;
		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
	}

	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_free_path(path);

	ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
	if (ret)
		return ret;

	ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
	if (ret) { /* -ENOENT, logic error */
		btrfs_err(fs_info, "update block group failed for %llu %llu",
			ins->objectid, ins->offset);
		BUG();
	}
	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
	return ret;
}

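/*
 * Insert the extent item (or skinny metadata item) with its inline backref
 * for a newly allocated tree block, then update the free space tree and the
 * block group counters.
 */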
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_key extent_key;
	struct btrfs_tree_block_info *block_info;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_delayed_tree_ref *ref;
	u32 size = sizeof(*extent_item) + sizeof(*iref);
	u64 num_bytes;
	u64 flags = extent_op->flags_to_set;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

	ref = btrfs_delayed_node_to_tree_ref(node);

	extent_key.objectid = node->bytenr;
	if (skinny_metadata) {
		extent_key.offset = ref->level;
		extent_key.type = BTRFS_METADATA_ITEM_KEY;
		num_bytes = fs_info->nodesize;
	} else {
		extent_key.offset = node->num_bytes;
		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
		size += sizeof(*block_info);
		num_bytes = node->num_bytes;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      &extent_key, size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, 1);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);

	if (skinny_metadata) {
		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	} else {
		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
		btrfs_set_tree_block_level(leaf, block_info, ref->level);
		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
	}

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_SHARED_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
	} else {
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_TREE_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
	}

	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	ret = remove_from_free_space_tree(trans, extent_key.objectid,
					  num_bytes);
	if (ret)
		return ret;

	ret = btrfs_update_block_group(trans, extent_key.objectid,
				       fs_info->nodesize, 1);
	if (ret) { /* -ENOENT, logic error */
		btrfs_err(fs_info, "update block group failed for %llu %llu",
			extent_key.objectid, extent_key.offset);
		BUG();
	}

	trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
					  fs_info->nodesize);
	return ret;
}

int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root, u64 owner,
				     u64 offset, u64 ram_bytes,
				     struct btrfs_key *ins)
{
	struct btrfs_ref generic_ref = { 0 };

	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);

	btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
			       ins->objectid, ins->offset, 0);
	btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
	btrfs_ref_tree_mod(root->fs_info, &generic_ref);

	return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
}

/*
 * this is used by the tree logging recovery code.  It records that
 * an extent has been allocated and makes sure to clear the free
 * space cache bits as well
 */
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
				   u64 root_objectid, u64 owner, u64 offset,
				   struct btrfs_key *ins)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;

	/*
	 * Mixed block groups will exclude before processing the log so we only
	 * need to do the exclude dance if this fs isn't mixed.
	 */
	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = __exclude_logged_extent(fs_info, ins->objectid,
					      ins->offset);
		if (ret)
			return ret;
	}

	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
	if (!block_group)
		return -EINVAL;

	space_info = block_group->space_info;
	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	space_info->bytes_reserved += ins->offset;
	block_group->reserved += ins->offset;
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);

	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
					 offset, ins, 1);
	if (ret)
		btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
	btrfs_put_block_group(block_group);
	return ret;
}

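/*
 * Initialize the in-memory extent buffer for a freshly allocated tree block:
 * lock it, zero and fill in the header (level, owner, generation, fsid) and
 * mark it dirty in the transaction or, for log trees, in the log dirty pages.
 */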
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      u64 bytenr, int level, u64 owner,
		      enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *buf;

	buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
	if (IS_ERR(buf))
		return buf;

	/*
	 * Extra safety check in case the extent tree is corrupted and extent
	 * allocator chooses to use a tree block which is already used and
	 * locked.
	 */
	if (buf->lock_owner == current->pid) {
		btrfs_err_rl(fs_info,
"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
			buf->start, btrfs_header_owner(buf), current->pid);
		free_extent_buffer(buf);
		return ERR_PTR(-EUCLEAN);
	}

	/*
	 * This needs to stay, because we could allocate a freed block from an
	 * old tree into a new tree, so we need to make sure this new block is
	 * set to the appropriate level and owner.
	 */
	btrfs_set_buffer_lockdep_class(owner, buf, level);
	__btrfs_tree_lock(buf, nest);
	btrfs_clean_tree_block(buf);
	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
	clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);

	set_extent_buffer_uptodate(buf);

	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(buf, level);
	btrfs_set_header_bytenr(buf, buf->start);
	btrfs_set_header_generation(buf, trans->transid);
	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(buf, owner);
	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
		buf->log_index = root->log_transid % 2;
		/*
		 * we allow two log transactions at a time, use different
		 * EXTENT bit to differentiate dirty pages.
		 */
		if (buf->log_index == 0)
			set_extent_dirty(&root->dirty_log_pages, buf->start,
					buf->start + buf->len - 1, GFP_NOFS);
		else
			set_extent_new(&root->dirty_log_pages, buf->start,
					buf->start + buf->len - 1);
	} else {
		buf->log_index = -1;
		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
			 buf->start + buf->len - 1, GFP_NOFS);
	}
	/* this returns a buffer locked for blocking */
	return buf;
}

/*
 * finds a free extent and does all the dirty work required for allocation
 * returns the tree buffer or an ERR_PTR on error.
 */
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     u64 parent, u64 root_objectid,
					     const struct btrfs_disk_key *key,
					     int level, u64 hint,
					     u64 empty_size,
					     enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key ins;
	struct btrfs_block_rsv *block_rsv;
	struct extent_buffer *buf;
	struct btrfs_delayed_extent_op *extent_op;
	struct btrfs_ref generic_ref = { 0 };
	u64 flags = 0;
	int ret;
	u32 blocksize = fs_info->nodesize;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
	if (btrfs_is_testing(fs_info)) {
		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
					    level, root_objectid, nest);
		if (!IS_ERR(buf))
			root->alloc_bytenr += blocksize;
		return buf;
	}
#endif

	block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
	if (IS_ERR(block_rsv))
		return ERR_CAST(block_rsv);

	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
				   empty_size, hint, &ins, 0, 0);
	if (ret)
		goto out_unuse;

	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
				    root_objectid, nest);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_free_reserved;
	}

	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent == 0)
			parent = ins.objectid;
		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
	} else
		BUG_ON(parent > 0);

	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
		extent_op = btrfs_alloc_delayed_extent_op();
		if (!extent_op) {
			ret = -ENOMEM;
			goto out_free_buf;
		}
		if (key)
			memcpy(&extent_op->key, key, sizeof(extent_op->key));
		else
			memset(&extent_op->key, 0, sizeof(extent_op->key));
		extent_op->flags_to_set = flags;
		extent_op->update_key = skinny_metadata ? false : true;
		extent_op->update_flags = true;
		extent_op->is_data = false;
		extent_op->level = level;

		btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
				       ins.objectid, ins.offset, parent);
		generic_ref.real_root = root->root_key.objectid;
		btrfs_init_tree_ref(&generic_ref, level, root_objectid);
		btrfs_ref_tree_mod(fs_info, &generic_ref);
		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
		if (ret)
			goto out_free_delayed;
	}
	return buf;

out_free_delayed:
	btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
	free_extent_buffer(buf);
out_free_reserved:
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
out_unuse:
	btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
	return ERR_PTR(ret);
}

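/*
 * State shared by the tree walk helpers below (walk_down_proc, do_walk_down
 * and friends): per-level reference counts and flags, the current walk stage
 * (DROP_REFERENCE or UPDATE_BACKREF) and the drop/update progress keys.
 */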
struct walk_control {
	u64 refs[BTRFS_MAX_LEVEL];
	u64 flags[BTRFS_MAX_LEVEL];
	struct btrfs_key update_progress;
	struct btrfs_key drop_progress;
	int drop_level;
	int stage;
	int level;
	int shared_level;
	int update_ref;
	int keep_locks;
	int reada_slot;
	int reada_count;
	int restarted;
};

#define DROP_REFERENCE	1
#define UPDATE_BACKREF	2

Y
Yan, Zheng 已提交
4904 4905 4906 4907
static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct walk_control *wc,
				     struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 refs;
	u64 flags;
	u32 nritems;
	struct btrfs_key key;
	struct extent_buffer *eb;
	int ret;
	int slot;
	int nread = 0;

	if (path->slots[wc->level] < wc->reada_slot) {
		wc->reada_count = wc->reada_count * 2 / 3;
		wc->reada_count = max(wc->reada_count, 2);
	} else {
		wc->reada_count = wc->reada_count * 3 / 2;
		wc->reada_count = min_t(int, wc->reada_count,
					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
	}

	eb = path->nodes[wc->level];
	nritems = btrfs_header_nritems(eb);

	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
		if (nread >= wc->reada_count)
			break;

		cond_resched();
		bytenr = btrfs_node_blockptr(eb, slot);
		generation = btrfs_node_ptr_generation(eb, slot);

		if (slot == path->slots[wc->level])
			goto reada;

		if (wc->stage == UPDATE_BACKREF &&
		    generation <= root->root_key.offset)
			continue;

		/* We don't lock the tree block, it's OK to be racy here */
		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
					       wc->level - 1, 1, &refs,
					       &flags);
		/* We don't care about errors in readahead. */
		if (ret < 0)
			continue;
		BUG_ON(refs == 0);

		if (wc->stage == DROP_REFERENCE) {
			if (refs == 1)
				goto reada;

			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				continue;
			btrfs_node_key_to_cpu(eb, &key, slot);
			ret = btrfs_comp_cpu_keys(&key,
						  &wc->update_progress);
			if (ret < 0)
				continue;
		} else {
			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
		}
reada:
		btrfs_readahead_node_child(eb, slot);
		nread++;
	}
	wc->reada_slot = slot;
}

/*
 * helper to process tree block while walking down the tree.
 *
 * when wc->stage == UPDATE_BACKREF, this function updates
 * back refs for pointers in the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc, int lookup_info)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	int ret;

	if (wc->stage == UPDATE_BACKREF &&
	    btrfs_header_owner(eb) != root->root_key.objectid)
		return 1;

	/*
	 * when reference count of tree block is 1, it won't increase
	 * again. once full backref flag is set, we never clear it.
	 */
	if (lookup_info &&
	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
		BUG_ON(!path->locks[level]);
		ret = btrfs_lookup_extent_info(trans, fs_info,
					       eb->start, level, 1,
					       &wc->refs[level],
					       &wc->flags[level]);
		BUG_ON(ret == -ENOMEM);
		if (ret)
			return ret;
		BUG_ON(wc->refs[level] == 0);
	}

	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level] > 1)
			return 1;

		if (path->locks[level] && !wc->keep_locks) {
			btrfs_tree_unlock_rw(eb, path->locks[level]);
			path->locks[level] = 0;
		}
		return 0;
	}

	/* wc->stage == UPDATE_BACKREF */
	if (!(wc->flags[level] & flag)) {
		BUG_ON(!path->locks[level]);
		ret = btrfs_inc_ref(trans, root, eb, 1);
		BUG_ON(ret); /* -ENOMEM */
		ret = btrfs_dec_ref(trans, root, eb, 0);
		BUG_ON(ret); /* -ENOMEM */
		ret = btrfs_set_disk_extent_flags(trans, eb, flag,
						  btrfs_header_level(eb), 0);
		BUG_ON(ret); /* -ENOMEM */
		wc->flags[level] |= flag;
	}

	/*
	 * the block is shared by multiple trees, so it's not good to
	 * keep the tree lock
	 */
	if (path->locks[level] && level > 0) {
		btrfs_tree_unlock_rw(eb, path->locks[level]);
		path->locks[level] = 0;
	}
	return 0;
}

/*
 * This is used to verify a ref exists for this root to deal with a bug where we
 * would have a drop_progress key that hadn't been updated properly.
 */
static int check_ref_exists(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, u64 bytenr, u64 parent,
			    int level)
{
	struct btrfs_path *path;
	struct btrfs_extent_inline_ref *iref;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = lookup_extent_backref(trans, path, &iref, bytenr,
				    root->fs_info->nodesize, parent,
				    root->root_key.objectid, level, 0);
	btrfs_free_path(path);
	if (ret == -ENOENT)
		return 0;
	if (ret < 0)
		return ret;
	return 1;
}

/*
 * helper to process tree block pointer.
 *
 * when wc->stage == DROP_REFERENCE, this function checks
 * reference count of the block pointed to. if the block
 * is shared and we need to update back refs for the subtree
 * rooted at the block, this function changes wc->stage to
 * UPDATE_BACKREF. if the block is shared and there is no
 * need to update back refs, this function drops the reference
 * to the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int *lookup_info)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 parent;
	struct btrfs_key key;
	struct btrfs_key first_key;
	struct btrfs_ref ref = { 0 };
	struct extent_buffer *next;
	int level = wc->level;
	int reada = 0;
	int ret = 0;
	bool need_account = false;

	generation = btrfs_node_ptr_generation(path->nodes[level],
					       path->slots[level]);
	/*
	 * if the lower level block was created before the snapshot
	 * was created, we know there is no need to update back refs
	 * for the subtree
	 */
	if (wc->stage == UPDATE_BACKREF &&
	    generation <= root->root_key.offset) {
		*lookup_info = 1;
		return 1;
	}

	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
	btrfs_node_key_to_cpu(path->nodes[level], &first_key,
			      path->slots[level]);

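	/*
	 * Look for the child extent buffer in memory first; if it is not
	 * cached yet, create a placeholder so it can be read below.
	 */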
	next = find_extent_buffer(fs_info, bytenr);
	if (!next) {
		next = btrfs_find_create_tree_block(fs_info, bytenr,
				root->root_key.objectid, level - 1);
		if (IS_ERR(next))
			return PTR_ERR(next);
		reada = 1;
	}
	btrfs_tree_lock(next);

	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
				       &wc->refs[level - 1],
				       &wc->flags[level - 1]);
	if (ret < 0)
		goto out_unlock;

	if (unlikely(wc->refs[level - 1] == 0)) {
		btrfs_err(fs_info, "Missing references.");
		ret = -EIO;
		goto out_unlock;
	}
	*lookup_info = 0;

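	/*
	 * Decide how to treat the child block: in the DROP_REFERENCE stage a
	 * shared child is either skipped (its reference is dropped at the
	 * skip label below) or the walk switches to the UPDATE_BACKREF stage
	 * for the subtree rooted at it.
	 */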
	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level - 1] > 1) {
			need_account = true;
			if (level == 1 &&
			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				goto skip;

			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				goto skip;

			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
			if (ret < 0)
				goto skip;

			wc->stage = UPDATE_BACKREF;
			wc->shared_level = level - 1;
		}
	} else {
		if (level == 1 &&
		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
			goto skip;
	}

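	/*
	 * If the cached buffer does not match the expected generation it is
	 * stale; drop it so the block is re-read from disk below.
	 */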
	if (!btrfs_buffer_uptodate(next, generation, 0)) {
		btrfs_tree_unlock(next);
		free_extent_buffer(next);
		next = NULL;
		*lookup_info = 1;
	}

	if (!next) {
		if (reada && level == 1)
			reada_walk_down(trans, root, wc, path);
		next = read_tree_block(fs_info, bytenr, root->root_key.objectid,
				       generation, level - 1, &first_key);
		if (IS_ERR(next)) {
			return PTR_ERR(next);
		} else if (!extent_buffer_uptodate(next)) {
			free_extent_buffer(next);
			return -EIO;
		}
		btrfs_tree_lock(next);
	}

	level--;
	ASSERT(level == btrfs_header_level(next));
	if (level != btrfs_header_level(next)) {
		btrfs_err(root->fs_info, "mismatched level");
		ret = -EIO;
		goto out_unlock;
	}
	path->nodes[level] = next;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK;
	wc->level = level;
	if (wc->level == 1)
		wc->reada_slot = 0;
	return 0;
skip:
	wc->refs[level - 1] = 0;
	wc->flags[level - 1] = 0;
	if (wc->stage == DROP_REFERENCE) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			parent = path->nodes[level]->start;
		} else {
			ASSERT(root->root_key.objectid ==
			       btrfs_header_owner(path->nodes[level]));
			if (root->root_key.objectid !=
			    btrfs_header_owner(path->nodes[level])) {
				btrfs_err(root->fs_info,
						"mismatched block owner");
				ret = -EIO;
				goto out_unlock;
			}
			parent = 0;
		}

		/*
		 * If we had a drop_progress we need to verify the refs are set
		 * as expected.  If we find our ref then we know that from here
		 * on out everything should be correct, and we can clear the
		 * ->restarted flag.
		 */
		if (wc->restarted) {
			ret = check_ref_exists(trans, root, bytenr, parent,
					       level - 1);
			if (ret < 0)
				goto out_unlock;
			if (ret == 0)
				goto no_delete;
			ret = 0;
			wc->restarted = 0;
		}

		/*
		 * Reloc tree doesn't contribute to qgroup numbers, and we have
		 * already accounted them at merge time (replace_path),
		 * thus we could skip expensive subtree trace here.
		 */
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
		    need_account) {
			ret = btrfs_qgroup_trace_subtree(trans, next,
							 generation, level - 1);
			if (ret) {
				btrfs_err_rl(fs_info,
					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
					     ret);
			}
		}

		/*
		 * We need to update the next key in our walk control so we can
		 * update the drop_progress key accordingly.  We don't care if
		 * find_next_key doesn't find a key because that means we're at
		 * the end and are going to clean up now.
		 */
		wc->drop_level = level;
		find_next_key(path, level, &wc->drop_progress);

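		/*
		 * Drop our reference on the skipped child block via a delayed
		 * ref; the extent is freed once nothing references it anymore.
		 */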
		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
				       fs_info->nodesize, parent);
		btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
		ret = btrfs_free_extent(trans, &ref);
		if (ret)
			goto out_unlock;
	}
no_delete:
	*lookup_info = 1;
	ret = 1;

out_unlock:
	btrfs_tree_unlock(next);
	free_extent_buffer(next);

	return ret;
}

/*
 * helper to process tree block while walking up the tree.
 *
 * when wc->stage == DROP_REFERENCE, this function drops
 * reference count on the block.
 *
 * when wc->stage == UPDATE_BACKREF, this function changes
 * wc->stage back to DROP_REFERENCE if we changed wc->stage
 * to UPDATE_BACKREF previously while processing the block.
 *
 * NOTE: return value 1 means we should stop walking up.
 */
static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 parent = 0;

	if (wc->stage == UPDATE_BACKREF) {
		BUG_ON(wc->shared_level < level);
		if (level < wc->shared_level)
			goto out;

		ret = find_next_key(path, level + 1, &wc->update_progress);
		if (ret > 0)
			wc->update_ref = 0;

		wc->stage = DROP_REFERENCE;
		wc->shared_level = -1;
		path->slots[level] = 0;

		/*
		 * check reference count again if the block isn't locked.
		 * we should start walking down the tree again if reference
		 * count is one.
		 */
		if (!path->locks[level]) {
			BUG_ON(level == 0);
			btrfs_tree_lock(eb);
			path->locks[level] = BTRFS_WRITE_LOCK;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						       eb->start, level, 1,
						       &wc->refs[level],
						       &wc->flags[level]);
			if (ret < 0) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return ret;
			}
			BUG_ON(wc->refs[level] == 0);
			if (wc->refs[level] == 1) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return 1;
			}
		}
	}

	/* wc->stage == DROP_REFERENCE */
	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);

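	/*
	 * The block is only referenced by this tree.  For a leaf, drop the
	 * refs held by its items, then make sure the block is cleaned before
	 * it is freed at the end of this function.
	 */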
	if (wc->refs[level] == 1) {
		if (level == 0) {
			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				ret = btrfs_dec_ref(trans, root, eb, 1);
			else
				ret = btrfs_dec_ref(trans, root, eb, 0);
			BUG_ON(ret); /* -ENOMEM */
			if (is_fstree(root->root_key.objectid)) {
				ret = btrfs_qgroup_trace_leaf_items(trans, eb);
				if (ret) {
					btrfs_err_rl(fs_info,
	"error %d accounting leaf items, quota is out of sync, rescan required",
					     ret);
				}
			}
		}
		/* make block locked assertion in btrfs_clean_tree_block happy */
		if (!path->locks[level] &&
		    btrfs_header_generation(eb) == trans->transid) {
			btrfs_tree_lock(eb);
			path->locks[level] = BTRFS_WRITE_LOCK;
		}
		btrfs_clean_tree_block(eb);
	}

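	/*
	 * Determine which parent to pass to btrfs_free_tree_block(): blocks
	 * carrying the full backref flag are referenced by parent bytenr,
	 * otherwise the block must be owned by this root (checked below).
	 */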
	if (eb == root->node) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = eb->start;
		else if (root->root_key.objectid != btrfs_header_owner(eb))
			goto owner_mismatch;
	} else {
		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = path->nodes[level + 1]->start;
		else if (root->root_key.objectid !=
			 btrfs_header_owner(path->nodes[level + 1]))
			goto owner_mismatch;
	}

	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
out:
	wc->refs[level] = 0;
	wc->flags[level] = 0;
	return 0;

owner_mismatch:
	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
		     btrfs_header_owner(eb), root->root_key.objectid);
	return -EUCLEAN;
}

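/*
 * Walk down from wc->level towards the leaves, running walk_down_proc()
 * and do_walk_down() on each block until one of them asks us to stop.
 */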
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc)
{
	int level = wc->level;
	int lookup_info = 1;
	int ret;

	while (level >= 0) {
		ret = walk_down_proc(trans, root, path, wc, lookup_info);
		if (ret > 0)
			break;

		if (level == 0)
			break;

		if (path->slots[level] >=
		    btrfs_header_nritems(path->nodes[level]))
			break;

		ret = do_walk_down(trans, root, path, wc, &lookup_info);
		if (ret > 0) {
			path->slots[level]++;
			continue;
		} else if (ret < 0)
			return ret;
		level = wc->level;
	}
	return 0;
}

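/*
 * Walk back up the tree: fully processed blocks are handed to
 * walk_up_proc() and their locks and buffers released, stopping as soon
 * as a node with unvisited slots (or max_level) is reached.
 */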
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int max_level)
{
	int level = wc->level;
	int ret;

	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
	while (level < max_level && path->nodes[level]) {
		wc->level = level;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			path->slots[level]++;
			return 0;
		} else {
			ret = walk_up_proc(trans, root, path, wc);
			if (ret > 0)
				return 0;
			if (ret < 0)
				return ret;

			if (path->locks[level]) {
				btrfs_tree_unlock_rw(path->nodes[level],
						     path->locks[level]);
				path->locks[level] = 0;
			}
			free_extent_buffer(path->nodes[level]);
			path->nodes[level] = NULL;
			level++;
		}
	}
	return 1;
}

/*
 * drop a subvolume tree.
 *
 * this function traverses the tree freeing any blocks that are only
 * referenced by the tree.
 *
 * when a shared tree block is found, this function decreases its
 * reference count by one. if update_ref is true, this function
 * also makes sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 *
 * If called with for_reloc == 0, may exit early with -EAGAIN
 */
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root_item *root_item = &root->root_item;
	struct walk_control *wc;
	struct btrfs_key key;
	int err = 0;
	int ret;
	int level;
	bool root_dropped = false;

	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		err = -ENOMEM;
		goto out;
	}

	/*
	 * Use join to avoid potential EINTR from transaction start. See
	 * wait_reserve_ticket and the whole reservation callchain.
	 */
	if (for_reloc)
		trans = btrfs_join_transaction(tree_root);
	else
		trans = btrfs_start_transaction(tree_root, 0);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}

	err = btrfs_run_delayed_items(trans);
	if (err)
		goto out_end_trans;

	/*
	 * This will help us catch people modifying the fs tree while we're
	 * dropping it.  It is unsafe to mess with the fs tree while it's being
	 * dropped as we unlock the root node and parent nodes as we walk down
	 * the tree, assuming nothing will change.  If something does change
	 * then we'll have stale information and drop references to blocks we've
	 * already dropped.
	 */
	set_bit(BTRFS_ROOT_DELETING, &root->state);
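	/*
	 * Either start the walk from the root node or, if a drop_progress key
	 * was saved by an earlier (interrupted) drop, resume from that key at
	 * the recorded drop level.
	 */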
	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_header_level(root->node);
		path->nodes[level] = btrfs_lock_root_node(root);
		path->slots[level] = 0;
		path->locks[level] = BTRFS_WRITE_LOCK;
		memset(&wc->update_progress, 0,
		       sizeof(wc->update_progress));
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
		memcpy(&wc->update_progress, &key,
		       sizeof(wc->update_progress));

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			err = ret;
			goto out_end_trans;
		}
		WARN_ON(ret > 0);

		/*
		 * unlock our path, this is safe because only this
		 * function is allowed to delete this snapshot
		 */
		btrfs_unlock_up_safe(path, 0);

		level = btrfs_header_level(root->node);
		while (1) {
			btrfs_tree_lock(path->nodes[level]);
			path->locks[level] = BTRFS_WRITE_LOCK;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						path->nodes[level]->start,
						level, 1, &wc->refs[level],
						&wc->flags[level]);
			if (ret < 0) {
				err = ret;
				goto out_end_trans;
			}
			BUG_ON(wc->refs[level] == 0);

			if (level == btrfs_root_drop_level(root_item))
				break;

			btrfs_tree_unlock(path->nodes[level]);
			path->locks[level] = 0;
			WARN_ON(wc->refs[level] != 1);
			level--;
		}
	}

	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = update_ref;
	wc->keep_locks = 0;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {

		ret = walk_down_tree(trans, root, path, wc);
		if (ret < 0) {
			err = ret;
			break;
		}

		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
		if (ret < 0) {
			err = ret;
			break;
		}

		if (ret > 0) {
			BUG_ON(wc->stage != DROP_REFERENCE);
			break;
		}

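		/*
		 * Record how far the walk got so the drop can resume from the
		 * same place after a transaction restart or a remount.
		 */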
		if (wc->stage == DROP_REFERENCE) {
			wc->drop_level = wc->level;
			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
					      &wc->drop_progress,
					      path->slots[wc->drop_level]);
		}
		btrfs_cpu_key_to_disk(&root_item->drop_progress,
				      &wc->drop_progress);
		btrfs_set_root_drop_level(root_item, wc->drop_level);

		BUG_ON(wc->level == 0);
		if (btrfs_should_end_transaction(trans) ||
		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
			ret = btrfs_update_root(trans, tree_root,
						&root->root_key,
						root_item);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				err = ret;
				goto out_end_trans;
			}

			btrfs_end_transaction_throttle(trans);
			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
				btrfs_debug(fs_info,
					    "drop snapshot early exit");
				err = -EAGAIN;
				goto out_free;
			}

		       /*
			* Use join to avoid potential EINTR from transaction
			* start. See wait_reserve_ticket and the whole
			* reservation callchain.
			*/
			if (for_reloc)
				trans = btrfs_join_transaction(tree_root);
			else
				trans = btrfs_start_transaction(tree_root, 0);
			if (IS_ERR(trans)) {
				err = PTR_ERR(trans);
				goto out_free;
			}
		}
	}
	btrfs_release_path(path);
	if (err)
		goto out_end_trans;

	ret = btrfs_del_root(trans, &root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		err = ret;
		goto out_end_trans;
	}

	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_find_root(tree_root, &root->root_key, path,
				      NULL, NULL);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			err = ret;
			goto out_end_trans;
		} else if (ret > 0) {
			/* if we fail to delete the orphan item this time
			 * around, it'll get picked up the next time.
			 *
			 * The most common failure here is just -ENOENT.
			 */
			btrfs_del_orphan_item(trans, tree_root,
					      root->root_key.objectid);
		}
	}

	/*
	 * This subvolume is going to be completely dropped, and won't be
	 * recorded as dirty roots, thus pertrans meta rsv will not be freed at
	 * commit transaction time.  So free it here manually.
	 */
	btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
	btrfs_qgroup_free_meta_all_pertrans(root);

	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
		btrfs_add_dropped_root(trans, root);
	else
		btrfs_put_root(root);
	root_dropped = true;
out_end_trans:
	btrfs_end_transaction_throttle(trans);
out_free:
	kfree(wc);
	btrfs_free_path(path);
out:
	/*
	 * So if we need to stop dropping the snapshot for whatever reason we
	 * need to make sure to add it back to the dead root list so that we
	 * keep trying to do the work later.  This also cleans up roots if we
	 * don't have it in the radix (like when we recover after a power fail
	 * or unmount) so we don't leak memory.
	 */
	if (!for_reloc && !root_dropped)
		btrfs_add_dead_root(root);
	return err;
}

/*
 * drop subtree rooted at tree block 'node'.
 *
 * NOTE: this function will unlock and release tree block 'node'
 * only used by relocation code
 */
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct extent_buffer *node,
			struct extent_buffer *parent)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct walk_control *wc;
	int level;
	int parent_level;
	int ret = 0;
	int wret;

	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

	btrfs_assert_tree_locked(parent);
	parent_level = btrfs_header_level(parent);
	atomic_inc(&parent->refs);
	path->nodes[parent_level] = parent;
	path->slots[parent_level] = btrfs_header_nritems(parent);

	btrfs_assert_tree_locked(node);
	level = btrfs_header_level(node);
	path->nodes[level] = node;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK;

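	/*
	 * The parent is treated as a shared block (full backref) with a
	 * single reference; the subtree is walked in the DROP_REFERENCE
	 * stage while keeping locks held.
	 */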
	wc->refs[parent_level] = 1;
	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = 0;
	wc->keep_locks = 1;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {
		wret = walk_down_tree(trans, root, path, wc);
		if (wret < 0) {
			ret = wret;
			break;
		}

		wret = walk_up_tree(trans, root, path, wc, parent_level);
		if (wret < 0)
			ret = wret;
		if (wret != 0)
			break;
	}

	kfree(wc);
	btrfs_free_path(path);
	return ret;
}

/*
 * helper to account the unused space of all the readonly block groups in the
 * space_info. takes mirrors into account.
 */
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
	struct btrfs_block_group *block_group;
	u64 free_bytes = 0;
	int factor;

	/* It's df, we don't care if it's racy */
	if (list_empty(&sinfo->ro_bgs))
		return 0;

	spin_lock(&sinfo->lock);
	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
		spin_lock(&block_group->lock);

		if (!block_group->ro) {
			spin_unlock(&block_group->lock);
			continue;
		}

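		/*
		 * Scale by the block group's redundancy factor so mirrored
		 * copies of the unused space are counted as well.
		 */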
		factor = btrfs_bg_type_to_factor(block_group->flags);
		free_bytes += (block_group->length -
			       block_group->used) * factor;

		spin_unlock(&block_group->lock);
	}
	spin_unlock(&sinfo->lock);

	return free_bytes;
}

int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
				   u64 start, u64 end)
{
	return unpin_extent_range(fs_info, start, end, false);
}

/*
 * It used to be that old block groups would be left around forever.
 * Iterating over them would be enough to trim unused space.  Since we
 * now automatically remove them, we also need to iterate over unallocated
 * space.
 *
 * We don't want a transaction for this since the discard may take a
 * substantial amount of time.  We don't require that a transaction be
 * running, but we do need to take a running transaction into account
 * to ensure that we're not discarding chunks that were released or
 * allocated in the current transaction.
 *
 * Holding the chunks lock will prevent other threads from allocating
 * or releasing chunks, but it won't prevent a running transaction
 * from committing and releasing the memory that the pending chunks
 * list head uses.  For that, we need to take a reference to the
 * transaction and hold the commit root sem.  We only need to hold
 * it while performing the free space search since we have already
 * held back allocations.
 */
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
	u64 start = SZ_1M, len = 0, end = 0;
	int ret;

	*trimmed = 0;

	/* Discard not supported = nothing to do. */
	if (!blk_queue_discard(bdev_get_queue(device->bdev)))
		return 0;

	/* Not writable = nothing to do. */
	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return 0;

	/* No free space = nothing to do. */
	if (device->total_bytes <= device->bytes_used)
		return 0;

	ret = 0;

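	/*
	 * Walk the device's allocation state and issue a discard for every
	 * range that is neither allocated to a chunk nor already trimmed.
	 */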
	while (1) {
		struct btrfs_fs_info *fs_info = device->fs_info;
		u64 bytes;

		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
		if (ret)
			break;

		find_first_clear_extent_bit(&device->alloc_state, start,
					    &start, &end,
					    CHUNK_TRIMMED | CHUNK_ALLOCATED);

		/* Check if there are any CHUNK_* bits left */
		if (start > device->total_bytes) {
			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
			btrfs_warn_in_rcu(fs_info,
"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
					  start, end - start + 1,
					  rcu_str_deref(device->name),
					  device->total_bytes);
			mutex_unlock(&fs_info->chunk_mutex);
			ret = 0;
			break;
		}

		/* Ensure we skip the reserved area in the first 1M */
		start = max_t(u64, start, SZ_1M);

		/*
		 * If find_first_clear_extent_bit finds a range that spans the
		 * end of the device it will set end to -1, in this case it's up
		 * to the caller to trim the value to the size of the device.
		 */
		end = min(end, device->total_bytes - 1);

		len = end - start + 1;

		/* We didn't find any extents */
		if (!len) {
			mutex_unlock(&fs_info->chunk_mutex);
			ret = 0;
			break;
		}

		ret = btrfs_issue_discard(device->bdev, start, len,
					  &bytes);
		if (!ret)
			set_extent_bits(&device->alloc_state, start,
					start + bytes - 1,
					CHUNK_TRIMMED);
		mutex_unlock(&fs_info->chunk_mutex);

		if (ret)
			break;

		start += len;
		*trimmed += bytes;

		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	return ret;
}

/*
 * Trim the whole filesystem by:
 * 1) trimming the free space in each block group
 * 2) trimming the unallocated space on each device
 *
 * This will also continue trimming even if a block group or device encounters
 * an error.  The return value will be the last error, or 0 if nothing bad
 * happens.
 */
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
	struct btrfs_block_group *cache = NULL;
	struct btrfs_device *device;
	struct list_head *devices;
	u64 group_trimmed;
	u64 range_end = U64_MAX;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 bg_failed = 0;
	u64 dev_failed = 0;
	int bg_ret = 0;
	int dev_ret = 0;
	int ret = 0;

	/*
	 * Check range overflow if range->len is set.
	 * The default range->len is U64_MAX.
	 */
	if (range->len != U64_MAX &&
	    check_add_overflow(range->start, range->len, &range_end))
		return -EINVAL;

	cache = btrfs_lookup_first_block_group(fs_info, range->start);
	for (; cache; cache = btrfs_next_block_group(cache)) {
		if (cache->start >= range_end) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->start);
		end = min(range_end, cache->start + cache->length);

		if (end - start >= range->minlen) {
			if (!btrfs_block_group_done(cache)) {
				ret = btrfs_cache_block_group(cache, 0);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
				ret = btrfs_wait_block_group_cache_done(cache);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				bg_failed++;
				bg_ret = ret;
				continue;
			}
		}
	}

	if (bg_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu block group(s), last error %d",
			bg_failed, bg_ret);
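	/* After the block groups, trim the unallocated space on each device. */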
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	devices = &fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		ret = btrfs_trim_free_extents(device, &group_trimmed);
		if (ret) {
			dev_failed++;
			dev_ret = ret;
			break;
		}

		trimmed += group_trimmed;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	if (dev_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu device(s), last error %d",
			dev_failed, dev_ret);
	range->len = trimmed;
	if (bg_ret)
		return bg_ret;
	return dev_ret;
}