// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "misc.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"
#include "rcu-string.h"
#include "zoned.h"
#include "dev-replace.h"

#undef SCRAMBLE_DELAYED_REFS

static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);

static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
			      u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&fs_info->excluded_extents, start, end,
			EXTENT_UPTODATE);
	return 0;
}

void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 start, end;

	start = cache->start;
	end = start + cache->length - 1;

	clear_extent_bits(&fs_info->excluded_extents, start, end,
			  EXTENT_UPTODATE);
}

/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * to see what the reference count and extent flags would be if all of
 * the delayed refs are not processed.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
		offset = fs_info->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
			ret = -EINVAL;
			btrfs_print_v0_err(fs_info);
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);

			goto out_free;
		}

		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			refcount_inc(&head->refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and try
			 * again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref_head(head);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs is optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. The full back refs is for pointers in tree blocks not
 * referenced by their owner trees. The location of tree block is recorded
 * in the back refs. Actually the full back refs is generic, and can be
 * used in all cases the implicit back refs is used. The major shortcoming
 * of the full back refs is its overhead. Every time a tree block gets
 * COWed, we have to update back refs entry for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in old transaction, the
 * only way to drop a reference to it is COW it. So we can detect the
 * event that tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs is used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointers in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs is used for
 * pointers in the block. Add full back refs for every pointers in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are entailed to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is a hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf
 *
 * When a file extent is allocated, the implicit back refs is used and
 * the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back refs is
 * the objectid of the block's owner tree. The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required. This information is stored in
 * the tree block info structure.
 */
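
/*
 * Worked example (illustrative only, the concrete values are made up): a data
 * extent starting at logical byte X, written by inode 257 at file offset 0 in
 * subvolume tree 5, gets an implicit back ref keyed as
 *
 *     (X, BTRFS_EXTENT_DATA_REF_KEY, hash_extent_data_ref(5, 257, 0))
 *
 * If the same extent is instead tracked with a full (shared) back ref, the
 * key becomes
 *
 *     (X, BTRFS_SHARED_DATA_REF_KEY, bytenr of the leaf holding the pointer)
 *
 * hash_extent_data_ref() below computes the hash used in the first form.
 */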

/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				     struct btrfs_extent_inline_ref *iref,
				     enum btrfs_inline_ref_type is_data)
{
	int type = btrfs_extent_inline_ref_type(eb, iref);
	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_DATA_REF_KEY ||
	    type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (is_data == BTRFS_REF_TYPE_BLOCK) {
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree block,
				 * which must be aligned to sector size.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->sectorsize))
					return type;
			}
		} else if (is_data == BTRFS_REF_TYPE_DATA) {
			if (type == BTRFS_EXTENT_DATA_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_DATA_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree block,
				 * which must be aligned to sector size.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->sectorsize))
					return type;
			}
		} else {
			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
			return type;
		}
	}

	btrfs_print_leaf((struct extent_buffer *)eb);
	btrfs_err(eb->fs_info,
		  "eb %llu iref 0x%lx invalid extent inline ref type %d",
		  eb->start, (unsigned long)iref, type);
	WARN_ON(1);

	return BTRFS_REF_TYPE_INVALID;
}

u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}
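
/*
 * The combined key offset packs two crc32c values into one u64: the root
 * objectid hash in the high bits and the (owner, offset) hash in the low
 * bits, i.e. (high_crc << 31) ^ low_crc as computed above. Lookup and
 * removal recompute the same hash from the same three inputs, so the key
 * is deterministic for a given (root, inode, file offset) triple.
 */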

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   int refs_to_drop, int *last_ref)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
		btrfs_print_v0_err(trans->fs_info);
		btrfs_abort_transaction(trans, -EINVAL);
		return -EINVAL;
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
		*last_ref = 1;
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;
	int type;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
	if (iref) {
		/*
		 * If type is invalid, we should have bailed out earlier than
		 * this call.
		 */
		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
		ASSERT(type != BTRFS_REF_TYPE_INVALID);
		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
				      path, &key, 0);
	btrfs_release_path(path);
	return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)

{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
	int needed;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->search_for_extension = 1;
		path->keep_locks = 1;
	} else
		extra_size = -1;

	/*
	 * Owner is our level, so we can just add one to get the level for the
	 * block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		skinny_metadata = false;
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

	if (ret && !insert) {
		err = -ENOENT;
		goto out;
	} else if (WARN_ON(ret)) {
		err = -EIO;
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (unlikely(item_size < sizeof(*ei))) {
		err = -EINVAL;
		btrfs_print_v0_err(fs_info);
		btrfs_abort_transaction(trans, err);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
		needed = BTRFS_REF_TYPE_DATA;
	else
		needed = BTRFS_REF_TYPE_BLOCK;

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
		if (type == BTRFS_REF_TYPE_INVALID) {
			err = -EUCLEAN;
			goto out;
		}

		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		path->search_for_extension = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

/*
 * helper to add new inline back ref
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	btrfs_extend_item(path, size);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset);
	}
	return ret;
}

/*
 * helper to update/remove inline back ref
 */
static noinline_for_stack
void update_inline_extent_backref(struct btrfs_path *path,
				  struct btrfs_extent_inline_ref *iref,
				  int refs_to_mod,
				  struct btrfs_delayed_extent_op *extent_op,
				  int *last_ref)
{
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	u64 refs;

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	/*
	 * If type is invalid, we should have bailed out after
	 * lookup_inline_extent_backref().
	 */
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
	ASSERT(type != BTRFS_REF_TYPE_INVALID);

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
	}

	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		*last_ref = 1;
		size =  btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		btrfs_truncate_item(path, item_size, 1);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 1);
	if (ret == 0) {
		/*
		 * We're adding refs to a tree block we already own, this
		 * should not happen at all.
		 */
		if (owner < BTRFS_FIRST_FREE_OBJECTID) {
			btrfs_crit(trans->fs_info,
"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu",
				   bytenr, num_bytes, root_objectid);
			if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
				WARN_ON(1);
				btrfs_crit(trans->fs_info,
			"path->slots[0]=%d path->nodes[0]:", path->slots[0]);
				btrfs_print_leaf(path->nodes[0]);
			}
			return -EUCLEAN;
		}
		update_inline_extent_backref(path, iref, refs_to_add,
					     extent_op, NULL);
	} else if (ret == -ENOENT) {
		setup_inline_extent_backref(trans->fs_info, path, iref, parent,
					    root_objectid, owner, offset,
					    refs_to_add, extent_op);
		ret = 0;
	}
	return ret;
}

static int remove_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 int refs_to_drop, int is_data, int *last_ref)
{
	int ret = 0;

	BUG_ON(!is_data && refs_to_drop != 1);
	if (iref) {
		update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
					     last_ref);
	} else if (is_data) {
		ret = remove_extent_data_ref(trans, path, refs_to_drop,
					     last_ref);
	} else {
		*last_ref = 1;
		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
	}
	return ret;
}

static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
			       u64 *discarded_bytes)
{
	int j, ret = 0;
	u64 bytes_left, end;
	u64 aligned_start = ALIGN(start, 1 << 9);

	if (WARN_ON(start != aligned_start)) {
		len -= aligned_start - start;
		len = round_down(len, 1 << 9);
		start = aligned_start;
	}

	*discarded_bytes = 0;

	if (!len)
		return 0;

	end = start + len;
	bytes_left = len;

	/* Skip any superblocks on this device. */
	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
		u64 sb_start = btrfs_sb_offset(j);
		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
		u64 size = sb_start - start;

		if (!in_range(sb_start, start, bytes_left) &&
		    !in_range(sb_end, start, bytes_left) &&
		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
			continue;

		/*
		 * Superblock spans beginning of range.  Adjust start and
		 * try again.
		 */
		if (sb_start <= start) {
			start += sb_end - start;
			if (start > end) {
				bytes_left = 0;
				break;
			}
			bytes_left = end - start;
			continue;
		}

		if (size) {
			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
						   GFP_NOFS, 0);
			if (!ret)
				*discarded_bytes += size;
			else if (ret != -EOPNOTSUPP)
				return ret;
		}

		start = sb_end;
		if (start > end) {
			bytes_left = 0;
			break;
		}
		bytes_left = end - start;
	}

	if (bytes_left) {
		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
					   GFP_NOFS, 0);
		if (!ret)
			*discarded_bytes += bytes_left;
	}
	return ret;
}

static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes)
{
	struct btrfs_device *dev = stripe->dev;
	struct btrfs_fs_info *fs_info = dev->fs_info;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	u64 phys = stripe->physical;
	u64 len = stripe->length;
	u64 discarded = 0;
	int ret = 0;

	/* Zone reset on a zoned filesystem */
	if (btrfs_can_zone_reset(dev, phys, len)) {
		u64 src_disc;

		ret = btrfs_reset_device_zone(dev, phys, len, &discarded);
		if (ret)
			goto out;

		if (!btrfs_dev_replace_is_ongoing(dev_replace) ||
		    dev != dev_replace->srcdev)
			goto out;

		src_disc = discarded;

		/* Send to replace target as well */
		ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len,
					      &discarded);
		discarded += src_disc;
	} else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) {
		ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded);
	} else {
		ret = 0;
		*bytes = 0;
	}

out:
	*bytes = discarded;
	return ret;
}

int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
			 u64 num_bytes, u64 *actual_bytes)
{
	int ret = 0;
	u64 discarded_bytes = 0;
	u64 end = bytenr + num_bytes;
	u64 cur = bytenr;
	struct btrfs_bio *bbio = NULL;


	/*
	 * Avoid races with device replace and make sure our bbio has devices
	 * associated to its stripes that don't go away while we are discarding.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	while (cur < end) {
		struct btrfs_bio_stripe *stripe;
		int i;

		num_bytes = end - cur;
		/* Tell the block device(s) that the sectors can be discarded */
		ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
				      &num_bytes, &bbio, 0);
		/*
		 * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
		 * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
		 * thus we can't continue anyway.
		 */
		if (ret < 0)
			goto out;

		stripe = bbio->stripes;
		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
			u64 bytes;

			if (!stripe->dev->bdev) {
				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
				continue;
			}

			ret = do_discard_extent(stripe, &bytes);
			if (!ret) {
				discarded_bytes += bytes;
			} else if (ret != -EOPNOTSUPP) {
				/*
				 * Logic errors or -ENOMEM, or -EIO, but
				 * unlikely to happen.
				 *
				 * And since there are two loops, explicitly
				 * go to out to avoid confusion.
				 */
				btrfs_put_bbio(bbio);
				goto out;
			}

			/*
			 * Just in case we get back EOPNOTSUPP for some reason,
			 * just ignore the return value so we don't screw up
			 * people calling discard_extent.
			 */
			ret = 0;
		}
		btrfs_put_bbio(bbio);
		cur += num_bytes;
	}
out:
	btrfs_bio_counter_dec(fs_info);

	if (actual_bytes)
		*actual_bytes = discarded_bytes;


	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret;
}

/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_ref *generic_ref)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;

	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
	       generic_ref->action);
	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
	       generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);

	if (generic_ref->type == BTRFS_REF_METADATA)
		ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL);
	else
		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0);

	btrfs_ref_tree_mod(fs_info, generic_ref);

	return ret;
}

/*
 * __btrfs_inc_extent_ref - insert backreference for a given extent
 *
 * The counterpart is in __btrfs_free_extent(), with examples and more details
 * how it works.
 *
 * @trans:	    Handle of transaction
 *
 * @node:	    The delayed ref node used to get the bytenr/length for
 *		    extent whose references are incremented.
 *
 * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
 *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
 *		    bytenr of the parent block. Since new extents are always
 *		    created with indirect references, this will only be the case
 *		    when relocating a shared extent. In that case, root_objectid
 *		    will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
 *		    be 0
 *
 * @root_objectid:  The id of the root where this modification has originated,
 *		    this can be either one of the well-known metadata trees or
 *		    the subvolume id which references this extent.
 *
 * @owner:	    For data extents it is the inode number of the owning file.
 *		    For metadata extents this parameter holds the level in the
 *		    tree of the extent.
 *
 * @offset:	    For metadata extents the offset is ignored and is currently
 *		    always passed as 0. For data extents it is the fileoffset
 *		    this extent belongs to.
 *
 * @refs_to_add     Number of references to add
 *
 * @extent_op       Pointer to a structure, holding information necessary when
 *                  updating a tree block's flags
 *
 */
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_delayed_ref_node *node,
				  u64 parent, u64 root_objectid,
				  u64 owner, u64 offset, int refs_to_add,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
	struct btrfs_key key;
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	u64 refs;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* this will setup the path even if it fails to insert the back ref */
	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
					   parent, root_objectid, owner,
					   offset, refs_to_add, extent_op);
	if ((ret < 0 && ret != -EAGAIN) || !ret)
		goto out;

	/*
	 * Ok we had -EAGAIN which means we didn't have space to insert and
	 * inline extent ref, so just update the reference count and add a
	 * normal backref.
	 */
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, item);
	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, item);

	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/* now insert the actual backref */
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		BUG_ON(refs_to_add != 1);
		ret = insert_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = insert_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset,
					     refs_to_add);
	}
	if (ret)
		btrfs_abort_transaction(trans, ret);
out:
	btrfs_free_path(path);
	return ret;
}

static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_key ins;
	u64 parent = 0;
	u64 ref_root = 0;
	u64 flags = 0;

	ins.objectid = node->bytenr;
	ins.offset = node->num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;

	ref = btrfs_delayed_node_to_data_ref(node);
	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);

	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
		parent = ref->parent;
	ref_root = ref->root;

	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		if (extent_op)
			flags |= extent_op->flags_to_set;
		ret = alloc_reserved_file_extent(trans, parent, ref_root,
						 flags, ref->objectid,
						 ref->offset, &ins,
						 node->ref_mod);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
					     ref->objectid, ref->offset,
					     node->ref_mod, extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, node, parent,
					  ref_root, ref->objectid,
					  ref->offset, node->ref_mod,
					  extent_op);
	} else {
		BUG();
	}
	return ret;
}

static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei)
{
	u64 flags = btrfs_extent_flags(leaf, ei);
	if (extent_op->update_flags) {
		flags |= extent_op->flags_to_set;
		btrfs_set_extent_flags(leaf, ei, flags);
	}

	if (extent_op->update_key) {
		struct btrfs_tree_block_info *bi;
		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
	}
}

static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
				 struct btrfs_delayed_ref_head *head,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	u32 item_size;
	int ret;
	int err = 0;
	int metadata = !extent_op->is_data;

	if (TRANS_ABORTED(trans))
		return 0;

	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		metadata = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = head->bytenr;

	if (metadata) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = extent_op->level;
	} else {
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = head->num_bytes;
	}

again:
	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}
	if (ret > 0) {
		if (metadata) {
			if (path->slots[0] > 0) {
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == head->bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == head->num_bytes)
					ret = 0;
			}
			if (ret > 0) {
				btrfs_release_path(path);
				metadata = 0;

				key.objectid = head->bytenr;
				key.offset = head->num_bytes;
				key.type = BTRFS_EXTENT_ITEM_KEY;
				goto again;
			}
		} else {
			err = -EIO;
			goto out;
		}
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);

	if (unlikely(item_size < sizeof(*ei))) {
		err = -EINVAL;
		btrfs_print_v0_err(fs_info);
		btrfs_abort_transaction(trans, err);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	__run_delayed_extent_op(extent_op, leaf, ei);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return err;
}

static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_tree_ref *ref;
	u64 parent = 0;
	u64 ref_root = 0;

	ref = btrfs_delayed_node_to_tree_ref(node);
	trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		parent = ref->parent;
	ref_root = ref->root;

	if (node->ref_mod != 1) {
		btrfs_err(trans->fs_info,
	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
			  node->bytenr, node->ref_mod, node->action, ref_root,
			  parent);
		return -EIO;
	}
	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		BUG_ON(!extent_op || !extent_op->update_flags);
		ret = alloc_reserved_tree_block(trans, node, extent_op);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
					     ref->level, 0, 1, extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, node, parent, ref_root,
					  ref->level, 0, 1, extent_op);
	} else {
		BUG();
	}
	return ret;
}

/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node,
			       struct btrfs_delayed_extent_op *extent_op,
			       int insert_reserved)
{
	int ret = 0;

	if (TRANS_ABORTED(trans)) {
		if (insert_reserved)
			btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
		return 0;
	}

	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		ret = run_delayed_tree_ref(trans, node, extent_op,
					   insert_reserved);
	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
		 node->type == BTRFS_SHARED_DATA_REF_KEY)
		ret = run_delayed_data_ref(trans, node, extent_op,
					   insert_reserved);
	else
		BUG();
	if (ret && insert_reserved)
		btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1);
	return ret;
}

static inline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_ref_node *ref;

	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
		return NULL;

	/*
	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
	 * This is to prevent a ref count from going down to zero, which deletes
	 * the extent item from the extent tree, when there still are references
	 * to add, which would fail because they would not find the extent item.
	 */
1734 1735 1736 1737
	if (!list_empty(&head->ref_add_list))
		return list_first_entry(&head->ref_add_list,
				struct btrfs_delayed_ref_node, add_list);

1738
	ref = rb_entry(rb_first_cached(&head->ref_tree),
1739
		       struct btrfs_delayed_ref_node, ref_node);
1740 1741
	ASSERT(list_empty(&ref->add_list));
	return ref;
1742 1743
}

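/*
 * Put a locked ref head back into the ready state so that another task can
 * pick it up later, e.g. after a failed or postponed processing attempt.
 */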
static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
				      struct btrfs_delayed_ref_head *head)
{
	spin_lock(&delayed_refs->lock);
	head->processing = 0;
	delayed_refs->num_heads_ready++;
	spin_unlock(&delayed_refs->lock);
	btrfs_delayed_ref_unlock(head);
}

static struct btrfs_delayed_extent_op *cleanup_extent_op(
				struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_extent_op *extent_op = head->extent_op;

	if (!extent_op)
		return NULL;

	if (head->must_insert_reserved) {
		head->extent_op = NULL;
		btrfs_free_delayed_extent_op(extent_op);
		return NULL;
	}
	return extent_op;
}

static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	extent_op = cleanup_extent_op(head);
	if (!extent_op)
		return 0;
	head->extent_op = NULL;
	spin_unlock(&head->lock);
	ret = run_delayed_extent_op(trans, head, extent_op);
	btrfs_free_delayed_extent_op(extent_op);
	return ret ? ret : 1;
}

void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
				  struct btrfs_delayed_ref_root *delayed_refs,
				  struct btrfs_delayed_ref_head *head)
{
	int nr_items = 1;	/* Dropping this ref head update. */

	/*
	 * We had csum deletions accounted for in our delayed refs rsv, we need
	 * to drop the csum leaves for this update from our delayed_refs_rsv.
	 */
	if (head->total_ref_mod < 0 && head->is_data) {
		spin_lock(&delayed_refs->lock);
		delayed_refs->pending_csums -= head->num_bytes;
		spin_unlock(&delayed_refs->lock);
		nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes);
	}

	/*
	 * We were dropping refs, or had a new ref and dropped it, and thus must
	 * adjust down our total_bytes_pinned, the space may or may not have
	 * been pinned and so is accounted for properly in the pinned space by
	 * now.
	 */
	if (head->total_ref_mod < 0 ||
	    (head->total_ref_mod == 0 && head->must_insert_reserved)) {
		u64 flags = btrfs_ref_head_to_space_flags(head);

		btrfs_mod_total_bytes_pinned(fs_info, flags, -head->num_bytes);
	}

	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
}

static int cleanup_ref_head(struct btrfs_trans_handle *trans,
			    struct btrfs_delayed_ref_head *head)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	ret = run_and_cleanup_extent_op(trans, head);
	if (ret < 0) {
		unselect_delayed_ref_head(delayed_refs, head);
		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
		return ret;
	} else if (ret) {
		return ret;
	}

	/*
	 * Need to drop our head ref lock and re-acquire the delayed ref lock
	 * and then re-check to make sure nobody got added.
	 */
	spin_unlock(&head->lock);
	spin_lock(&delayed_refs->lock);
	spin_lock(&head->lock);
	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
		spin_unlock(&head->lock);
		spin_unlock(&delayed_refs->lock);
		return 1;
	}
	btrfs_delete_ref_head(delayed_refs, head);
	spin_unlock(&head->lock);
	spin_unlock(&delayed_refs->lock);

	if (head->must_insert_reserved) {
		btrfs_pin_extent(trans, head->bytenr, head->num_bytes, 1);
		if (head->is_data) {
			ret = btrfs_del_csums(trans, fs_info->csum_root,
					      head->bytenr, head->num_bytes);
		}
	}

	btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);

	trace_run_delayed_ref_head(fs_info, head, 0);
	btrfs_delayed_ref_unlock(head);
	btrfs_put_delayed_ref_head(head);
	return 0;
}

static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
					struct btrfs_trans_handle *trans)
{
	struct btrfs_delayed_ref_root *delayed_refs =
		&trans->transaction->delayed_refs;
	struct btrfs_delayed_ref_head *head = NULL;
	int ret;

	spin_lock(&delayed_refs->lock);
	head = btrfs_select_ref_head(delayed_refs);
	if (!head) {
		spin_unlock(&delayed_refs->lock);
		return head;
	}

	/*
	 * Grab the lock that says we are going to process all the refs for
	 * this head
	 */
	ret = btrfs_delayed_ref_lock(delayed_refs, head);
	spin_unlock(&delayed_refs->lock);

	/*
	 * We may have dropped the spin lock to get the head mutex lock, and
	 * that might have given someone else time to free the head.  If that's
	 * true, it has been removed from our list and we can move on.
	 */
	if (ret == -EAGAIN)
		head = ERR_PTR(-EAGAIN);

	return head;
}

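/*
 * Run the delayed refs queued on @locked_ref one by one, merging mergeable
 * refs between iterations.  Returns 0 once the head has no refs left,
 * -EAGAIN if a ref is still blocked by a tree mod log sequence number, or a
 * negative error from processing a ref.
 */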
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
				    struct btrfs_delayed_ref_head *locked_ref,
				    unsigned long *run_refs)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_extent_op *extent_op;
	struct btrfs_delayed_ref_node *ref;
	int must_insert_reserved = 0;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	lockdep_assert_held(&locked_ref->mutex);
	lockdep_assert_held(&locked_ref->lock);

	while ((ref = select_delayed_ref(locked_ref))) {
		if (ref->seq &&
		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
			spin_unlock(&locked_ref->lock);
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			return -EAGAIN;
		}

		(*run_refs)++;
		ref->in_tree = 0;
		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
		RB_CLEAR_NODE(&ref->ref_node);
		if (!list_empty(&ref->add_list))
			list_del(&ref->add_list);
		/*
		 * When we play the delayed ref, also correct the ref_mod on
		 * head
		 */
		switch (ref->action) {
		case BTRFS_ADD_DELAYED_REF:
		case BTRFS_ADD_DELAYED_EXTENT:
			locked_ref->ref_mod -= ref->ref_mod;
			break;
		case BTRFS_DROP_DELAYED_REF:
			locked_ref->ref_mod += ref->ref_mod;
			break;
		default:
			WARN_ON(1);
		}
		atomic_dec(&delayed_refs->num_entries);

		/*
		 * Record the must_insert_reserved flag before we drop the
		 * spin lock.
		 */
		must_insert_reserved = locked_ref->must_insert_reserved;
		locked_ref->must_insert_reserved = 0;

		extent_op = locked_ref->extent_op;
		locked_ref->extent_op = NULL;
		spin_unlock(&locked_ref->lock);

		ret = run_one_delayed_ref(trans, ref, extent_op,
					  must_insert_reserved);

		btrfs_free_delayed_extent_op(extent_op);
		if (ret) {
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			btrfs_put_delayed_ref(ref);
			btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
				    ret);
			return ret;
		}

		btrfs_put_delayed_ref(ref);
		cond_resched();

		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
	}

	return 0;
}

/*
 * Returns 0 on success or if called with an already aborted transaction.
 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
 */
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
					     unsigned long nr)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *locked_ref = NULL;
	ktime_t start = ktime_get();
	int ret;
	unsigned long count = 0;
	unsigned long actual_count = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	do {
		if (!locked_ref) {
			locked_ref = btrfs_obtain_ref_head(trans);
			if (IS_ERR_OR_NULL(locked_ref)) {
				if (PTR_ERR(locked_ref) == -EAGAIN) {
					continue;
				} else {
					break;
				}
			}
			count++;
		}
		/*
		 * We need to try and merge add/drops of the same ref since we
		 * can run into issues with relocate dropping the implicit ref
		 * and then it being added back again before the drop can
		 * finish.  If we merged anything we need to re-loop so we can
		 * get a good ref.
		 * Or we can get node references of the same type that weren't
		 * merged when created due to bumps in the tree mod seq, and
		 * we need to merge them to prevent adding an inline extent
		 * backref before dropping it (triggering a BUG_ON at
		 * insert_inline_extent_backref()).
		 */
		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);

		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
						      &actual_count);
		if (ret < 0 && ret != -EAGAIN) {
			/*
			 * Error, btrfs_run_delayed_refs_for_head already
			 * unlocked everything so just bail out
			 */
			return ret;
		} else if (!ret) {
			/*
			 * Success, perform the usual cleanup of a processed
			 * head
			 */
			ret = cleanup_ref_head(trans, locked_ref);
			if (ret > 0) {
				/* We dropped our lock, we need to loop. */
				ret = 0;
				continue;
			} else if (ret) {
				return ret;
			}
		}

		/*
		 * Either success case or btrfs_run_delayed_refs_for_head
		 * returned -EAGAIN, meaning we need to select another head
		 */

		locked_ref = NULL;
		cond_resched();
	} while ((nr != -1 && count < nr) || locked_ref);

	/*
	 * We don't want to include ref heads since we can have empty ref heads
	 * and those will drastically skew our runtime down since we just do
	 * accounting, no actual extent tree updates.
	 */
	if (actual_count > 0) {
		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
		u64 avg;

		/*
		 * We weigh the current average higher than our current runtime
		 * to avoid large swings in the average.
		 */
		spin_lock(&delayed_refs->lock);
		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
		spin_unlock(&delayed_refs->lock);
	}
	return 0;
}

#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning
 */
static u64 find_middle(struct rb_root *root)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int alt = 1;
	u64 middle;
	u64 first = 0, last = 0;

	n = rb_first(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		first = entry->bytenr;
	}
	n = rb_last(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		last = entry->bytenr;
	}
	n = root->rb_node;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		middle = entry->bytenr;

		if (alt)
			n = n->rb_left;
		else
			n = n->rb_right;

		alt = 1 - alt;
	}
	return middle;
}
#endif

/*
 * this starts processing the delayed reference count updates and
 * extent insertions we have queued up so far.  count can be
 * 0, which means to process everything in the tree at the start
 * of the run (but not newly added entries), or it can be some target
 * number you'd like to process.
 *
 * Returns 0 on success or if called with an aborted transaction
 * Returns <0 on error and aborts the transaction
 */
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
			   unsigned long count)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct rb_node *node;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *head;
	int ret;
	int run_all = count == (unsigned long)-1;

	/* We'll clean this up in btrfs_cleanup_transaction */
	if (TRANS_ABORTED(trans))
		return 0;

	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
		return 0;

	delayed_refs = &trans->transaction->delayed_refs;
	if (count == 0)
		count = delayed_refs->num_heads_ready;

again:
#ifdef SCRAMBLE_DELAYED_REFS
	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
	ret = __btrfs_run_delayed_refs(trans, count);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (run_all) {
		btrfs_create_pending_block_groups(trans);

		spin_lock(&delayed_refs->lock);
		node = rb_first_cached(&delayed_refs->href_root);
		if (!node) {
			spin_unlock(&delayed_refs->lock);
			goto out;
		}
		head = rb_entry(node, struct btrfs_delayed_ref_head,
				href_node);
		refcount_inc(&head->refs);
		spin_unlock(&delayed_refs->lock);

		/* Mutex was contended, block until it's released and retry. */
		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);

		btrfs_put_delayed_ref_head(head);
		cond_resched();
		goto again;
	}
out:
	return 0;
}

int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
				struct extent_buffer *eb, u64 flags,
				int level, int is_data)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	extent_op = btrfs_alloc_delayed_extent_op();
	if (!extent_op)
		return -ENOMEM;

	extent_op->flags_to_set = flags;
	extent_op->update_flags = true;
	extent_op->update_key = false;
	extent_op->is_data = is_data ? true : false;
	extent_op->level = level;

	ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
	if (ret)
		btrfs_free_delayed_extent_op(extent_op);
	return ret;
}

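/*
 * Look for a data ref for (objectid, offset, bytenr) in the current
 * transaction's delayed refs.  Returns 1 if a different root, inode or
 * offset references the extent (a cross reference exists), 0 if not, and
 * -EAGAIN if the head was contended and the caller should retry.
 */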
static noinline int check_delayed_ref(struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid, u64 offset, u64 bytenr)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_data_ref *data_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_transaction *cur_trans;
	struct rb_node *node;
	int ret = 0;

	spin_lock(&root->fs_info->trans_lock);
	cur_trans = root->fs_info->running_transaction;
	if (cur_trans)
		refcount_inc(&cur_trans->use_count);
	spin_unlock(&root->fs_info->trans_lock);
	if (!cur_trans)
		return 0;

	delayed_refs = &cur_trans->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (!head) {
		spin_unlock(&delayed_refs->lock);
		btrfs_put_transaction(cur_trans);
		return 0;
	}

	if (!mutex_trylock(&head->mutex)) {
		refcount_inc(&head->refs);
		spin_unlock(&delayed_refs->lock);

		btrfs_release_path(path);

		/*
		 * Mutex was contended, block until it's released and let
		 * caller try again
		 */
		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref_head(head);
		btrfs_put_transaction(cur_trans);
		return -EAGAIN;
	}
	spin_unlock(&delayed_refs->lock);

	spin_lock(&head->lock);
	/*
	 * XXX: We should replace this with a proper search function in the
	 * future.
	 */
	for (node = rb_first_cached(&head->ref_tree); node;
	     node = rb_next(node)) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		/* If it's a shared ref we know a cross reference exists */
		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
			ret = 1;
			break;
		}

		data_ref = btrfs_delayed_node_to_data_ref(ref);

		/*
		 * If our ref doesn't match the one we're currently looking at
		 * then we have a cross reference.
		 */
		if (data_ref->root != root->root_key.objectid ||
		    data_ref->objectid != objectid ||
		    data_ref->offset != offset) {
			ret = 1;
			break;
		}
	}
	spin_unlock(&head->lock);
	mutex_unlock(&head->mutex);
	btrfs_put_transaction(cur_trans);
	return ret;
}

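/*
 * Check the committed extent tree for other references to @bytenr.  Returns
 * 0 when the extent is referenced only by the given root/objectid/offset,
 * 1 when it is (or may be) shared, and -ENOENT or another negative errno
 * when the extent item could not be checked.
 */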
static noinline int check_committed_ref(struct btrfs_root *root,
					struct btrfs_path *path,
					u64 objectid, u64 offset, u64 bytenr,
					bool strict)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_extent_item *ei;
	struct btrfs_key key;
	u32 item_size;
	int type;
	int ret;

	key.objectid = bytenr;
	key.offset = (u64)-1;
	key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	BUG_ON(ret == 0); /* Corruption */

	ret = -ENOENT;
	if (path->slots[0] == 0)
		goto out;

	path->slots[0]--;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
		goto out;

	ret = 1;
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);

	/* If extent item has more than 1 inline ref then it's shared */
	if (item_size != sizeof(*ei) +
	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
		goto out;

	/*
	 * If extent created before last snapshot => it's shared unless the
	 * snapshot has been deleted. Use the heuristic if strict is false.
	 */
	if (!strict &&
	    (btrfs_extent_generation(leaf, ei) <=
	     btrfs_root_last_snapshot(&root->root_item)))
		goto out;

	iref = (struct btrfs_extent_inline_ref *)(ei + 1);

	/* If this extent has SHARED_DATA_REF then it's shared */
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
	if (type != BTRFS_EXTENT_DATA_REF_KEY)
		goto out;

	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
	if (btrfs_extent_refs(leaf, ei) !=
	    btrfs_extent_data_ref_count(leaf, ref) ||
	    btrfs_extent_data_ref_root(leaf, ref) !=
	    root->root_key.objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		goto out;

	ret = 0;
out:
	return ret;
}

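/*
 * Check whether anyone else (another root, inode or offset) holds a
 * reference to the data extent at @bytenr, looking at both delayed refs and
 * the committed extent tree, so callers (e.g. the NOCOW write path) can
 * decide whether the extent may be overwritten in place.
 */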
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
			  u64 bytenr, bool strict)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	do {
		ret = check_committed_ref(root, path, objectid,
					  offset, bytenr, strict);
		if (ret && ret != -ENOENT)
			goto out;

		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
	} while (ret == -EAGAIN);

out:
	btrfs_free_path(path);
	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
		WARN_ON(ret > 0);
	return ret;
}
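
/*
 * Adjust the extent references for every extent pointed to by the tree
 * block @buf, adding (inc=1) or dropping (inc=0) one reference for each.
 * When full_backref is set, @buf itself is used as the parent of the refs,
 * otherwise the block's owner root is used.
 */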
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct extent_buffer *buf,
			   int full_backref, int inc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 num_bytes;
	u64 parent;
	u64 ref_root;
	u32 nritems;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct btrfs_ref generic_ref = { 0 };
	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
	int i;
	int action;
	int level;
	int ret = 0;

	if (btrfs_is_testing(fs_info))
		return 0;

	ref_root = btrfs_header_owner(buf);
	nritems = btrfs_header_nritems(buf);
	level = btrfs_header_level(buf);

	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && level == 0)
		return 0;

	if (full_backref)
		parent = buf->start;
	else
		parent = 0;
	if (inc)
		action = BTRFS_ADD_DELAYED_REF;
	else
		action = BTRFS_DROP_DELAYED_REF;

	for (i = 0; i < nritems; i++) {
		if (level == 0) {
			btrfs_item_key_to_cpu(buf, &key, i);
			if (key.type != BTRFS_EXTENT_DATA_KEY)
				continue;
			fi = btrfs_item_ptr(buf, i,
					    struct btrfs_file_extent_item);
			if (btrfs_file_extent_type(buf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE)
				continue;
			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
			if (bytenr == 0)
				continue;

			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
			key.offset -= btrfs_file_extent_offset(buf, fi);
			btrfs_init_generic_ref(&generic_ref, action, bytenr,
					       num_bytes, parent);
			generic_ref.real_root = root->root_key.objectid;
			btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
					    key.offset);
			generic_ref.skip_qgroup = for_reloc;
			if (inc)
				ret = btrfs_inc_extent_ref(trans, &generic_ref);
			else
				ret = btrfs_free_extent(trans, &generic_ref);
			if (ret)
				goto fail;
		} else {
			bytenr = btrfs_node_blockptr(buf, i);
			num_bytes = fs_info->nodesize;
			btrfs_init_generic_ref(&generic_ref, action, bytenr,
					       num_bytes, parent);
			generic_ref.real_root = root->root_key.objectid;
			btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
			generic_ref.skip_qgroup = for_reloc;
			if (inc)
				ret = btrfs_inc_extent_ref(trans, &generic_ref);
			else
				ret = btrfs_free_extent(trans, &generic_ref);
			if (ret)
				goto fail;
		}
	}
	return 0;
fail:
	return ret;
}

int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
}

int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}

int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_block_group *block_group;
	int readonly = 0;

	block_group = btrfs_lookup_block_group(fs_info, bytenr);
	if (!block_group || block_group->ro)
		readonly = 1;
	if (block_group)
		btrfs_put_block_group(block_group);
	return readonly;
}

static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 flags;
	u64 ret;

	if (data)
		flags = BTRFS_BLOCK_GROUP_DATA;
	else if (root == fs_info->chunk_root)
		flags = BTRFS_BLOCK_GROUP_SYSTEM;
	else
		flags = BTRFS_BLOCK_GROUP_METADATA;

	ret = btrfs_get_alloc_profile(fs_info, flags);
	return ret;
}

static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
{
	struct btrfs_block_group *cache;
	u64 bytenr;

	spin_lock(&fs_info->block_group_cache_lock);
	bytenr = fs_info->first_logical_byte;
	spin_unlock(&fs_info->block_group_cache_lock);

	if (bytenr < (u64)-1)
		return bytenr;

	cache = btrfs_lookup_first_block_group(fs_info, search_start);
	if (!cache)
		return 0;

	bytenr = cache->start;
	btrfs_put_block_group(cache);

	return bytenr;
}

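/*
 * Account @num_bytes at @bytenr as pinned in both the block group and its
 * space_info, and mark the range dirty in the transaction's pinned_extents
 * tree so it is not reused until the transaction commits.
 */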
static int pin_down_extent(struct btrfs_trans_handle *trans,
			   struct btrfs_block_group *cache,
			   u64 bytenr, u64 num_bytes, int reserved)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;

	spin_lock(&cache->space_info->lock);
	spin_lock(&cache->lock);
	cache->pinned += num_bytes;
	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
					     num_bytes);
	if (reserved) {
		cache->reserved -= num_bytes;
		cache->space_info->bytes_reserved -= num_bytes;
	}
	spin_unlock(&cache->lock);
	spin_unlock(&cache->space_info->lock);

	__btrfs_mod_total_bytes_pinned(cache->space_info, num_bytes);
	set_extent_dirty(&trans->transaction->pinned_extents, bytenr,
			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
	return 0;
}

int btrfs_pin_extent(struct btrfs_trans_handle *trans,
		     u64 bytenr, u64 num_bytes, int reserved)
{
	struct btrfs_block_group *cache;

	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
	BUG_ON(!cache); /* Logic error */

	pin_down_extent(trans, cache, bytenr, num_bytes, reserved);

	btrfs_put_block_group(cache);
	return 0;
}

/*
 * This function must be called within a transaction.
 */
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
				    u64 bytenr, u64 num_bytes)
{
	struct btrfs_block_group *cache;
	int ret;

	cache = btrfs_lookup_block_group(trans->fs_info, bytenr);
	if (!cache)
		return -EINVAL;

	/*
	 * pull in the free space cache (if any) so that our pin
	 * removes the free space from the cache.  We have load_only set
	 * to one because the slow code to read in the free extents does check
	 * the pinned extents.
	 */
	btrfs_cache_block_group(cache, 1);
	/*
	 * Make sure we wait until the cache is completely built in case it is
	 * missing or is invalid and therefore needs to be rebuilt.
	 */
	ret = btrfs_wait_block_group_cache_done(cache);
	if (ret)
		goto out;

	pin_down_extent(trans, cache, bytenr, num_bytes, 0);

	/* remove us from the free space cache (if we're there at all) */
	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
out:
	btrfs_put_block_group(cache);
	return ret;
}

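/*
 * Remove the given range from the free space cache of the block group that
 * contains it, so that extents referenced by the tree log are not handed
 * out again during log replay.
 */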
static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
				   u64 start, u64 num_bytes)
2622 2623
{
	int ret;
2624
	struct btrfs_block_group *block_group;
2625

2626
	block_group = btrfs_lookup_block_group(fs_info, start);
2627 2628 2629
	if (!block_group)
		return -EINVAL;

2630 2631 2632 2633 2634 2635 2636 2637
	btrfs_cache_block_group(block_group, 1);
	/*
	 * Make sure we wait until the cache is completely built in case it is
	 * missing or is invalid and therefore needs to be rebuilt.
	 */
	ret = btrfs_wait_block_group_cache_done(block_group);
	if (ret)
		goto out;
2638

2639 2640
	ret = btrfs_remove_free_space(block_group, start, num_bytes);
out:
2641 2642 2643 2644
	btrfs_put_block_group(block_group);
	return ret;
}

2645
int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2646
{
2647
	struct btrfs_fs_info *fs_info = eb->fs_info;
2648 2649 2650 2651
	struct btrfs_file_extent_item *item;
	struct btrfs_key key;
	int found_type;
	int i;
2652
	int ret = 0;
2653

2654
	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668
		return 0;

	for (i = 0; i < btrfs_header_nritems(eb); i++) {
		btrfs_item_key_to_cpu(eb, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
		found_type = btrfs_file_extent_type(eb, item);
		if (found_type == BTRFS_FILE_EXTENT_INLINE)
			continue;
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			continue;
		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2669 2670 2671
		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
		if (ret)
			break;
2672 2673
	}

2674
	return ret;
2675 2676
}

2677
static void
2678
btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2679 2680 2681 2682
{
	atomic_inc(&bg->reservations);
}

2683 2684 2685 2686 2687
/*
 * Returns the free cluster for the given space info and sets empty_cluster to
 * what it should be based on the mount options.
 */
static struct btrfs_free_cluster *
2688 2689
fetch_cluster_info(struct btrfs_fs_info *fs_info,
		   struct btrfs_space_info *space_info, u64 *empty_cluster)
2690 2691 2692 2693 2694 2695 2696 2697
{
	struct btrfs_free_cluster *ret = NULL;

	*empty_cluster = 0;
	if (btrfs_mixed_space_info(space_info))
		return ret;

	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2698
		ret = &fs_info->meta_alloc_cluster;
2699 2700 2701
		if (btrfs_test_opt(fs_info, SSD))
			*empty_cluster = SZ_2M;
		else
2702
			*empty_cluster = SZ_64K;
2703 2704 2705
	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
		*empty_cluster = SZ_2M;
2706
		ret = &fs_info->data_alloc_cluster;
2707 2708 2709 2710 2711
	}

	return ret;
}

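/*
 * Walk [start, end] and release the pinned accounting for every block group
 * the range touches, optionally returning the space to the free space cache
 * and to the global block reserve or pending space tickets.
 */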
static int unpin_extent_range(struct btrfs_fs_info *fs_info,
			      u64 start, u64 end,
2714
			      const bool return_free_space)
{
2716
	struct btrfs_block_group *cache = NULL;
2717 2718
	struct btrfs_space_info *space_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2719
	struct btrfs_free_cluster *cluster = NULL;
2720
	u64 len;
2721 2722
	u64 total_unpinned = 0;
	u64 empty_cluster = 0;
2723
	bool readonly;
C
Chris Mason 已提交
2724

2725
	while (start <= end) {
2726
		readonly = false;
2727
		if (!cache ||
2728
		    start >= cache->start + cache->length) {
2729 2730
			if (cache)
				btrfs_put_block_group(cache);
2731
			total_unpinned = 0;
2732
			cache = btrfs_lookup_block_group(fs_info, start);
2733
			BUG_ON(!cache); /* Logic error */
2734

2735
			cluster = fetch_cluster_info(fs_info,
2736 2737 2738
						     cache->space_info,
						     &empty_cluster);
			empty_cluster <<= 1;
2739 2740
		}

2741
		len = cache->start + cache->length - start;
2742 2743
		len = min(len, end + 1 - start);

2744
		down_read(&fs_info->commit_root_sem);
2745 2746 2747 2748
		if (start < cache->last_byte_to_unpin && return_free_space) {
			u64 add_len = min(len, cache->last_byte_to_unpin - start);

			btrfs_add_free_space(cache, start, add_len);
2749
		}
2750
		up_read(&fs_info->commit_root_sem);
2751

2752
		start += len;
2753
		total_unpinned += len;
2754
		space_info = cache->space_info;
2755

2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768
		/*
		 * If this space cluster has been marked as fragmented and we've
		 * unpinned enough in this block group to potentially allow a
		 * cluster to be created inside of it go ahead and clear the
		 * fragmented check.
		 */
		if (cluster && cluster->fragmented &&
		    total_unpinned > empty_cluster) {
			spin_lock(&cluster->lock);
			cluster->fragmented = 0;
			spin_unlock(&cluster->lock);
		}

2769
		spin_lock(&space_info->lock);
2770 2771
		spin_lock(&cache->lock);
		cache->pinned -= len;
2772
		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2773
		space_info->max_extent_size = 0;
2774
		__btrfs_mod_total_bytes_pinned(space_info, -len);
2775 2776 2777
		if (cache->ro) {
			space_info->bytes_readonly += len;
			readonly = true;
2778 2779 2780 2781
		} else if (btrfs_is_zoned(fs_info)) {
			/* Need reset before reusing in a zoned block group */
			space_info->bytes_zone_unusable += len;
			readonly = true;
2782
		}
2783
		spin_unlock(&cache->lock);
2784 2785 2786
		if (!readonly && return_free_space &&
		    global_rsv->space_info == space_info) {
			u64 to_add = len;
2787

2788 2789
			spin_lock(&global_rsv->lock);
			if (!global_rsv->full) {
2790 2791 2792
				to_add = min(len, global_rsv->size -
					     global_rsv->reserved);
				global_rsv->reserved += to_add;
2793 2794
				btrfs_space_info_update_bytes_may_use(fs_info,
						space_info, to_add);
2795 2796
				if (global_rsv->reserved >= global_rsv->size)
					global_rsv->full = 1;
2797
				len -= to_add;
2798 2799 2800
			}
			spin_unlock(&global_rsv->lock);
		}
2801 2802 2803
		/* Add to any tickets we may have */
		if (!readonly && return_free_space && len)
			btrfs_try_granting_tickets(fs_info, space_info);
2804
		spin_unlock(&space_info->lock);
	}
2806 2807 2808

	if (cache)
		btrfs_put_block_group(cache);
	return 0;
}

2812
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2813
{
2814
	struct btrfs_fs_info *fs_info = trans->fs_info;
2815
	struct btrfs_block_group *block_group, *tmp;
2816
	struct list_head *deleted_bgs;
2817
	struct extent_io_tree *unpin;
2818 2819
	u64 start;
	u64 end;
2820 2821
	int ret;

2822
	unpin = &trans->transaction->pinned_extents;
2823

2824
	while (!TRANS_ABORTED(trans)) {
2825 2826
		struct extent_state *cached_state = NULL;

2827
		mutex_lock(&fs_info->unused_bg_unpin_mutex);
2828
		ret = find_first_extent_bit(unpin, 0, &start, &end,
2829
					    EXTENT_DIRTY, &cached_state);
2830 2831
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2832
			break;
2833
		}
2834

2835
		if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2836
			ret = btrfs_discard_extent(fs_info, start,
2837
						   end + 1 - start, NULL);
2838

2839
		clear_extent_dirty(unpin, start, end, &cached_state);
2840
		unpin_extent_range(fs_info, start, end, true);
2841
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2842
		free_extent_state(cached_state);
2843
		cond_resched();
2844
	}

2846 2847
	if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
		btrfs_discard_calc_delay(&fs_info->discard_ctl);
2848
		btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
2849
	}
2850

2851 2852 2853 2854 2855 2856 2857 2858 2859 2860
	/*
	 * Transaction is finished.  We don't need the lock anymore.  We
	 * do need to clean up the block groups in case of a transaction
	 * abort.
	 */
	deleted_bgs = &trans->transaction->deleted_bgs;
	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
		u64 trimmed = 0;

		ret = -EROFS;
2861
		if (!TRANS_ABORTED(trans))
2862
			ret = btrfs_discard_extent(fs_info,
2863 2864
						   block_group->start,
						   block_group->length,
2865 2866 2867
						   &trimmed);

		list_del_init(&block_group->bg_list);
2868
		btrfs_unfreeze_block_group(block_group);
2869 2870 2871 2872 2873
		btrfs_put_block_group(block_group);

		if (ret) {
			const char *errstr = btrfs_decode_error(ret);
			btrfs_warn(fs_info,
2874
			   "discard failed while removing blockgroup: errno=%d %s",
2875 2876 2877 2878
				   ret, errstr);
		}
	}

	return 0;
}

/*
 * Drop one or more refs of @node.
 *
 * 1. Locate the extent refs.
 *    It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item.
 *    Locate it, then reduce the refs number or remove the ref line completely.
 *
 * 2. Update the refs count in EXTENT/METADATA_ITEM
 *
 * Inline backref case:
 *
 * in extent tree we have:
 *
 * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
 *		refs 2 gen 6 flags DATA
 *		extent data backref root FS_TREE objectid 258 offset 0 count 1
 *		extent data backref root FS_TREE objectid 257 offset 0 count 1
 *
 * This function gets called with:
 *
 *    node->bytenr = 13631488
 *    node->num_bytes = 1048576
 *    root_objectid = FS_TREE
 *    owner_objectid = 257
 *    owner_offset = 0
 *    refs_to_drop = 1
 *
 * Then we should get something like:
 *
 * 	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82
 *		refs 1 gen 6 flags DATA
 *		extent data backref root FS_TREE objectid 258 offset 0 count 1
 *
 * Keyed backref case:
 *
 * in extent tree we have:
 *
 *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
 *		refs 754 gen 6 flags DATA
 *	[...]
 *	item 2 key (13631488 EXTENT_DATA_REF <HASH>) itemoff 3915 itemsize 28
 *		extent data backref root FS_TREE objectid 866 offset 0 count 1
 *
 * This function gets called with:
 *
 *    node->bytenr = 13631488
 *    node->num_bytes = 1048576
 *    root_objectid = FS_TREE
 *    owner_objectid = 866
 *    owner_offset = 0
 *    refs_to_drop = 1
 *
 * Then we should get something like:
 *
 *	item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24
 *		refs 753 gen 6 flags DATA
 *
 * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed.
 */
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2942 2943 2944 2945
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extent_op)
2946
{
2947
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_key key;
2949
	struct btrfs_path *path;
2950
	struct btrfs_root *extent_root = info->extent_root;
2951
	struct extent_buffer *leaf;
2952 2953
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
2954
	int ret;
2955
	int is_data;
2956 2957 2958
	int extent_slot = 0;
	int found_extent = 0;
	int num_to_del = 1;
2959 2960
	u32 item_size;
	u64 refs;
2961 2962
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	int last_ref = 0;
2964
	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);

2966
	path = btrfs_alloc_path();
2967 2968
	if (!path)
		return -ENOMEM;
2969

2970
	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
2971 2972 2973 2974 2975 2976 2977 2978 2979

	if (!is_data && refs_to_drop != 1) {
		btrfs_crit(info,
"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u",
			   node->bytenr, refs_to_drop);
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
2980

2981
	if (is_data)
2982
		skinny_metadata = false;
2983

2984 2985
	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
				    parent, root_objectid, owner_objectid,
2986
				    owner_offset);
2987
	if (ret == 0) {
2988 2989 2990 2991 2992 2993 2994
		/*
		 * Either the inline backref or the SHARED_DATA_REF/
		 * SHARED_BLOCK_REF is found
		 *
		 * Here is a quick path to locate EXTENT/METADATA_ITEM.
		 * It's possible the EXTENT/METADATA_ITEM is near current slot.
		 */
2995
		extent_slot = path->slots[0];
2996 2997
		while (extent_slot >= 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
2998
					      extent_slot);
2999
			if (key.objectid != bytenr)
3000
				break;
3001 3002
			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes) {
3003 3004 3005
				found_extent = 1;
				break;
			}
3006 3007 3008 3009 3010
			if (key.type == BTRFS_METADATA_ITEM_KEY &&
			    key.offset == owner_objectid) {
				found_extent = 1;
				break;
			}
3011 3012

			/* Quick path didn't find the EXTENT/METADATA_ITEM */
3013 3014
			if (path->slots[0] - extent_slot > 5)
				break;
3015
			extent_slot--;
3016
		}
3017

Z
3019 3020 3021 3022 3023 3024 3025
			if (iref) {
				btrfs_crit(info,
"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref");
				btrfs_abort_transaction(trans, -EUCLEAN);
				goto err_dump;
			}
			/* Must be SHARED_* item, remove the backref first */
3026
			ret = remove_extent_backref(trans, path, NULL,
3027
						    refs_to_drop,
						    is_data, &last_ref);
3029
			if (ret) {
3030
				btrfs_abort_transaction(trans, ret);
3031 3032
				goto out;
			}
3033
			btrfs_release_path(path);
3034

3035
			/* Slow path to locate EXTENT/METADATA_ITEM */
3036 3037 3038 3039
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;

3040 3041 3042 3043 3044
			if (!is_data && skinny_metadata) {
				key.type = BTRFS_METADATA_ITEM_KEY;
				key.offset = owner_objectid;
			}

			ret = btrfs_search_slot(trans, extent_root,
						&key, path, -1, 1);
3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062
			if (ret > 0 && skinny_metadata && path->slots[0]) {
				/*
				 * Couldn't find our skinny metadata item,
				 * see if we have ye olde extent item.
				 */
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == num_bytes)
					ret = 0;
			}

			if (ret > 0 && skinny_metadata) {
				skinny_metadata = false;
3063
				key.objectid = bytenr;
3064 3065 3066 3067 3068 3069 3070
				key.type = BTRFS_EXTENT_ITEM_KEY;
				key.offset = num_bytes;
				btrfs_release_path(path);
				ret = btrfs_search_slot(trans, extent_root,
							&key, path, -1, 1);
			}

3071
			if (ret) {
				btrfs_err(info,
					  "umm, got %d back from search, was looking for %llu",
					  ret, bytenr);
3075
				if (ret > 0)
3076
					btrfs_print_leaf(path->nodes[0]);
3077
			}
3078
			if (ret < 0) {
3079
				btrfs_abort_transaction(trans, ret);
3080 3081
				goto out;
			}
			extent_slot = path->slots[0];
		}
3084
	} else if (WARN_ON(ret == -ENOENT)) {
3085
		btrfs_print_leaf(path->nodes[0]);
3086 3087
		btrfs_err(info,
			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
3088 3089
			bytenr, parent, root_objectid, owner_objectid,
			owner_offset);
3090
		btrfs_abort_transaction(trans, ret);
3091
		goto out;
3092
	} else {
3093
		btrfs_abort_transaction(trans, ret);
3094
		goto out;
3095
	}
3096 3097

	leaf = path->nodes[0];
3098
	item_size = btrfs_item_size_nr(leaf, extent_slot);
3099
	if (unlikely(item_size < sizeof(*ei))) {
3100 3101 3102 3103 3104
		ret = -EINVAL;
		btrfs_print_v0_err(info);
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
3105
	ei = btrfs_item_ptr(leaf, extent_slot,
			    struct btrfs_extent_item);
3107 3108
	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
	    key.type == BTRFS_EXTENT_ITEM_KEY) {
3109
		struct btrfs_tree_block_info *bi;
3110 3111
		if (item_size < sizeof(*ei) + sizeof(*bi)) {
			btrfs_crit(info,
3112
"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu",
3113 3114 3115 3116 3117 3118
				   key.objectid, key.type, key.offset,
				   owner_objectid, item_size,
				   sizeof(*ei) + sizeof(*bi));
			btrfs_abort_transaction(trans, -EUCLEAN);
			goto err_dump;
		}
3119 3120 3121
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
	}
3122

3123
	refs = btrfs_extent_refs(leaf, ei);
3124
	if (refs < refs_to_drop) {
3125 3126
		btrfs_crit(info,
		"trying to drop %d refs but we only have %llu for bytenr %llu",
			  refs_to_drop, refs, bytenr);
3128 3129
		btrfs_abort_transaction(trans, -EUCLEAN);
		goto err_dump;
3130
	}
3131
	refs -= refs_to_drop;
3132

3133 3134 3135 3136 3137 3138
	if (refs > 0) {
		if (extent_op)
			__run_delayed_extent_op(extent_op, leaf, ei);
		/*
		 * In the case of inline back ref, reference count will
		 * be updated by remove_extent_backref
3139
		 */
3140
		if (iref) {
3141 3142 3143 3144 3145 3146
			if (!found_extent) {
				btrfs_crit(info,
"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found");
				btrfs_abort_transaction(trans, -EUCLEAN);
				goto err_dump;
			}
3147 3148 3149 3150 3151
		} else {
			btrfs_set_extent_refs(leaf, ei, refs);
			btrfs_mark_buffer_dirty(leaf);
		}
		if (found_extent) {
3152 3153 3154
			ret = remove_extent_backref(trans, path, iref,
						    refs_to_drop, is_data,
						    &last_ref);
3155
			if (ret) {
3156
				btrfs_abort_transaction(trans, ret);
3157 3158
				goto out;
			}
3159
		}
3160
	} else {
3161
		/* In this branch refs == 1 */
3162
		if (found_extent) {
3163 3164 3165 3166 3167 3168 3169 3170 3171
			if (is_data && refs_to_drop !=
			    extent_data_ref_count(path, iref)) {
				btrfs_crit(info,
		"invalid refs_to_drop, current refs %u refs_to_drop %u",
					   extent_data_ref_count(path, iref),
					   refs_to_drop);
				btrfs_abort_transaction(trans, -EUCLEAN);
				goto err_dump;
			}
3172
			if (iref) {
3173 3174 3175 3176 3177 3178 3179 3180
				if (path->slots[0] != extent_slot) {
					btrfs_crit(info,
"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref",
						   key.objectid, key.type,
						   key.offset);
					btrfs_abort_transaction(trans, -EUCLEAN);
					goto err_dump;
				}
3181
			} else {
3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193
				/*
				 * No inline ref, we must be at SHARED_* item,
				 * And it's single ref, it must be:
				 * |	extent_slot	  ||extent_slot + 1|
				 * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ]
				 */
				if (path->slots[0] != extent_slot + 1) {
					btrfs_crit(info,
	"invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM");
					btrfs_abort_transaction(trans, -EUCLEAN);
					goto err_dump;
				}
3194 3195 3196
				path->slots[0] = extent_slot;
				num_to_del = 2;
			}
		}
3198

		last_ref = 1;
3200 3201
		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
				      num_to_del);
3202
		if (ret) {
3203
			btrfs_abort_transaction(trans, ret);
3204 3205
			goto out;
		}
3206
		btrfs_release_path(path);
3207

3208
		if (is_data) {
3209 3210
			ret = btrfs_del_csums(trans, info->csum_root, bytenr,
					      num_bytes);
3211
			if (ret) {
3212
				btrfs_abort_transaction(trans, ret);
3213 3214
				goto out;
			}
3215 3216
		}

3217
		ret = add_to_free_space_tree(trans, bytenr, num_bytes);
3218
		if (ret) {
3219
			btrfs_abort_transaction(trans, ret);
3220 3221 3222
			goto out;
		}

3223
		ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
3224
		if (ret) {
3225
			btrfs_abort_transaction(trans, ret);
3226 3227
			goto out;
		}
3228
	}
	btrfs_release_path(path);

3231
out:
3232
	btrfs_free_path(path);
3233
	return ret;
3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246
err_dump:
	/*
	 * Leaf dump can take up a lot of log buffer, so we only do full leaf
	 * dump for debug build.
	 */
	if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) {
		btrfs_crit(info, "path->slots[0]=%d extent_slot=%d",
			   path->slots[0], extent_slot);
		btrfs_print_leaf(path->nodes[0]);
	}

	btrfs_free_path(path);
	return -EUCLEAN;
3247 3248
}

3249
/*
 * When we free a block, it is possible (and likely) that we free the last
 * delayed ref for that extent as well.  This searches the delayed ref tree for
 * a given extent, and if there are no other delayed refs to be processed, it
 * removes it from the tree.
 */
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3256
				      u64 bytenr)
3257 3258 3259
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
3260
	int ret = 0;
3261 3262 3263

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
3264
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3265
	if (!head)
3266
		goto out_delayed_unlock;
3267

3268
	spin_lock(&head->lock);
3269
	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3270 3271
		goto out;

	if (cleanup_extent_op(head) != NULL)
		goto out;
3274

3275 3276 3277 3278 3279 3280 3281
	/*
	 * waiting for the lock here would deadlock.  If someone else has it
	 * locked they are already in the process of dropping it anyway
	 */
	if (!mutex_trylock(&head->mutex))
		goto out;

3282
	btrfs_delete_ref_head(delayed_refs, head);
3283
	head->processing = 0;
3284

3285
	spin_unlock(&head->lock);
3286 3287
	spin_unlock(&delayed_refs->lock);

3288 3289 3290 3291
	BUG_ON(head->extent_op);
	if (head->must_insert_reserved)
		ret = 1;

3292
	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3293
	mutex_unlock(&head->mutex);
3294
	btrfs_put_delayed_ref_head(head);
3295
	return ret;
3296
out:
3297
	spin_unlock(&head->lock);
3298 3299

out_delayed_unlock:
3300 3301 3302 3303
	spin_unlock(&delayed_refs->lock);
	return 0;
}

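/*
 * Free a tree block: queue a delayed ref drop for it and, when this was the
 * last reference and the block was allocated in the current transaction,
 * either pin it or return it directly to the free space cache.
 */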
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct extent_buffer *buf,
3307
			   u64 parent, int last_ref)
3308
{
3309
	struct btrfs_fs_info *fs_info = root->fs_info;
3310
	struct btrfs_ref generic_ref = { 0 };
3311 3312
	int ret;

3313 3314 3315 3316 3317
	btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
			       buf->start, buf->len, parent);
	btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
			    root->root_key.objectid);

3318
	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
3319
		btrfs_ref_tree_mod(fs_info, &generic_ref);
3320
		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL);
3321
		BUG_ON(ret); /* -ENOMEM */
3322 3323
	}

3324
	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
3325
		struct btrfs_block_group *cache;
3326

3327
		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
3328
			ret = check_ref_cleanup(trans, buf->start);
3329 3330
			if (!ret) {
				btrfs_redirty_list_add(trans->transaction, buf);
3331
				goto out;
3332
			}
3333 3334
		}

3335
		cache = btrfs_lookup_block_group(fs_info, buf->start);
3336

3337
		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3338
			pin_down_extent(trans, cache, buf->start, buf->len, 1);
3339
			btrfs_put_block_group(cache);
3340
			goto out;
3341 3342
		}

3343 3344 3345 3346 3347 3348 3349
		if (btrfs_is_zoned(fs_info)) {
			btrfs_redirty_list_add(trans->transaction, buf);
			pin_down_extent(trans, cache, buf->start, buf->len, 1);
			btrfs_put_block_group(cache);
			goto out;
		}

3350 3351 3352
		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));

		btrfs_add_free_space(cache, buf->start, buf->len);
3353
		btrfs_free_reserved_bytes(cache, buf->len, 0);
3354
		btrfs_put_block_group(cache);
3355
		trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3356 3357
	}
out:
3358 3359 3360 3361 3362 3363 3364
	if (last_ref) {
		/*
		 * Deleting the buffer, clear the corrupt flag since it doesn't
		 * matter anymore.
		 */
		clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
	}
3365 3366
}

3367
/* Can return -ENOMEM */
3368
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3369
{
3370
	struct btrfs_fs_info *fs_info = trans->fs_info;
3371 3372
	int ret;

3373
	if (btrfs_is_testing(fs_info))
3374
		return 0;
3375

3376 3377 3378 3379
	/*
	 * tree log blocks never actually go into the extent allocation
	 * tree, just update pinning info and exit early.
	 */
3380 3381 3382 3383
	if ((ref->type == BTRFS_REF_METADATA &&
	     ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
	    (ref->type == BTRFS_REF_DATA &&
	     ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
3384
		/* unlocks the pinned mutex */
3385
		btrfs_pin_extent(trans, ref->bytenr, ref->len, 1);
3386
		ret = 0;
3387
	} else if (ref->type == BTRFS_REF_METADATA) {
3388
		ret = btrfs_add_delayed_tree_ref(trans, ref, NULL);
3389
	} else {
3390
		ret = btrfs_add_delayed_data_ref(trans, ref, 0);
3391
	}
3392

3393 3394 3395 3396 3397
	if (!((ref->type == BTRFS_REF_METADATA &&
	       ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
	      (ref->type == BTRFS_REF_DATA &&
	       ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
		btrfs_ref_tree_mod(fs_info, ref);
3398

3399 3400 3401
	return ret;
}

enum btrfs_loop_type {
3403 3404 3405 3406
	LOOP_CACHING_NOWAIT,
	LOOP_CACHING_WAIT,
	LOOP_ALLOC_CHUNK,
	LOOP_NO_EMPTY_SIZE,
};

3409
static inline void
3410
btrfs_lock_block_group(struct btrfs_block_group *cache,
3411 3412 3413 3414 3415 3416
		       int delalloc)
{
	if (delalloc)
		down_read(&cache->data_rwsem);
}

3417
static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
3418 3419 3420 3421 3422 3423 3424
		       int delalloc)
{
	btrfs_get_block_group(cache);
	if (delalloc)
		down_read(&cache->data_rwsem);
}

3425 3426
static struct btrfs_block_group *btrfs_lock_cluster(
		   struct btrfs_block_group *block_group,
3427 3428
		   struct btrfs_free_cluster *cluster,
		   int delalloc)
3429
	__acquires(&cluster->refill_lock)
3430
{
3431
	struct btrfs_block_group *used_bg = NULL;
3432

3433
	spin_lock(&cluster->refill_lock);
3434 3435 3436 3437 3438 3439
	while (1) {
		used_bg = cluster->block_group;
		if (!used_bg)
			return NULL;

		if (used_bg == block_group)
3440 3441
			return used_bg;

3442
		btrfs_get_block_group(used_bg);
3443

3444 3445
		if (!delalloc)
			return used_bg;
3446

3447 3448
		if (down_read_trylock(&used_bg->data_rwsem))
			return used_bg;
3449

3450
		spin_unlock(&cluster->refill_lock);
3451

3452 3453
		/* We should only have one-level nested. */
		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
3454

3455 3456 3457
		spin_lock(&cluster->refill_lock);
		if (used_bg == cluster->block_group)
			return used_bg;
3458

3459 3460 3461
		up_read(&used_bg->data_rwsem);
		btrfs_put_block_group(used_bg);
	}
3462 3463 3464
}

static inline void
3465
btrfs_release_block_group(struct btrfs_block_group *cache,
3466 3467 3468 3469 3470 3471 3472
			 int delalloc)
{
	if (delalloc)
		up_read(&cache->data_rwsem);
	btrfs_put_block_group(cache);
}

3473 3474
enum btrfs_extent_allocation_policy {
	BTRFS_EXTENT_ALLOC_CLUSTERED,
3475
	BTRFS_EXTENT_ALLOC_ZONED,
3476 3477
};

3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493
/*
 * Structure used internally for find_free_extent() function.  Wraps needed
 * parameters.
 */
struct find_free_extent_ctl {
	/* Basic allocation info */
	u64 num_bytes;
	u64 empty_size;
	u64 flags;
	int delalloc;

	/* Where to start the search inside the bg */
	u64 search_start;

	/* For clustered allocation */
	u64 empty_cluster;
3494 3495
	struct btrfs_free_cluster *last_ptr;
	bool use_cluster;

	bool have_caching_bg;
	bool orig_have_caching_bg;

	/* Allocation is called for tree-log */
	bool for_treelog;

	/* RAID index, converted from flags */
	int index;

	/*
	 * Current loop number, check find_free_extent_update_loop() for details
	 */
	int loop;

	/*
	 * Whether we're refilling a cluster, if true we need to re-search
	 * current block group but don't try to refill the cluster again.
	 */
	bool retry_clustered;

	/*
	 * Whether we're updating free space cache, if true we need to re-search
	 * current block group but don't try updating free space cache again.
	 */
	bool retry_unclustered;

	/* If current block group is cached */
	int cached;

	/* Max contiguous hole found */
	u64 max_extent_size;

	/* Total free space from free space cache, not always contiguous */
	u64 total_free_space;

	/* Found result */
	u64 found_offset;

	/* Hint where to start looking for an empty space */
	u64 hint_byte;

	/* Allocation policy */
	enum btrfs_extent_allocation_policy policy;
};


/*
 * Helper function for find_free_extent().
 *
 * Return -ENOENT to inform caller that we need to fall back to unclustered mode.
 * Return -EAGAIN to inform caller that we need to re-search this block group.
 * Return >0 to inform caller that we found nothing.
 * Return 0 means we have found a location and set ffe_ctl->found_offset.
 */
static int find_free_extent_clustered(struct btrfs_block_group *bg,
				      struct find_free_extent_ctl *ffe_ctl,
				      struct btrfs_block_group **cluster_bg_ret)
{
	struct btrfs_block_group *cluster_bg;
	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
	u64 aligned_cluster;
	u64 offset;
	int ret;

	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
	if (!cluster_bg)
		goto refill_cluster;
	if (cluster_bg != bg && (cluster_bg->ro ||
	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
		goto release_cluster;

	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
			ffe_ctl->num_bytes, cluster_bg->start,
			&ffe_ctl->max_extent_size);
	if (offset) {
		/* We have a block, we're done */
		spin_unlock(&last_ptr->refill_lock);
		trace_btrfs_reserve_extent_cluster(cluster_bg,
				ffe_ctl->search_start, ffe_ctl->num_bytes);
		*cluster_bg_ret = cluster_bg;
		ffe_ctl->found_offset = offset;
		return 0;
	}
	WARN_ON(last_ptr->block_group != cluster_bg);

release_cluster:
	/*
	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
	 * let's just skip it and let the allocator find whatever block it can
	 * find. If we reach this point, we will have tried the cluster
	 * allocator plenty of times and not have found anything, so we are
	 * likely way too fragmented for the clustering stuff to find anything.
	 *
	 * However, if the cluster is taken from the current block group,
	 * release the cluster first, so that we stand a better chance of
	 * succeeding in the unclustered allocation.
	 */
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
		spin_unlock(&last_ptr->refill_lock);
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
		return -ENOENT;
	}

	/* This cluster didn't work out, free it and start over */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);

	if (cluster_bg != bg)
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);

refill_cluster:
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
		spin_unlock(&last_ptr->refill_lock);
		return -ENOENT;
	}

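	/*
	 * Size the new cluster to cover empty_cluster + empty_size, but no
	 * smaller than a full stripe.
	 */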
	aligned_cluster = max_t(u64,
			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
			bg->full_stripe_len);
	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
			ffe_ctl->num_bytes, aligned_cluster);
	if (ret == 0) {
		/* Now pull our allocation out of this cluster */
		offset = btrfs_alloc_from_cluster(bg, last_ptr,
				ffe_ctl->num_bytes, ffe_ctl->search_start,
				&ffe_ctl->max_extent_size);
		if (offset) {
			/* We found one, proceed */
			spin_unlock(&last_ptr->refill_lock);
			trace_btrfs_reserve_extent_cluster(bg,
					ffe_ctl->search_start,
					ffe_ctl->num_bytes);
			ffe_ctl->found_offset = offset;
			return 0;
		}
	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
		   !ffe_ctl->retry_clustered) {
		spin_unlock(&last_ptr->refill_lock);

		ffe_ctl->retry_clustered = true;
		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
				ffe_ctl->empty_cluster + ffe_ctl->empty_size);
		return -EAGAIN;
	}
	/*
	 * At this point we either didn't find a cluster or we weren't able to
	 * allocate a block from our cluster.  Free the cluster we've been
	 * trying to use, and go to the next block group.
	 */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);
	spin_unlock(&last_ptr->refill_lock);
	return 1;
}

/*
 * Return >0 to inform caller that we found nothing.
 * Return 0 when we found a free extent and set ffe_ctl->found_offset.
 * Return -EAGAIN to inform caller that we need to re-search this block group.
 */
static int find_free_extent_unclustered(struct btrfs_block_group *bg,
					struct find_free_extent_ctl *ffe_ctl)
{
	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;
	u64 offset;

	/*
	 * We are doing an unclustered allocation, set the fragmented flag so
	 * we don't bother trying to setup a cluster again until we get more
	 * space.
	 */
	if (unlikely(last_ptr)) {
		spin_lock(&last_ptr->lock);
		last_ptr->fragmented = 1;
		spin_unlock(&last_ptr->lock);
	}
	if (ffe_ctl->cached) {
		struct btrfs_free_space_ctl *free_space_ctl;

		free_space_ctl = bg->free_space_ctl;
		spin_lock(&free_space_ctl->tree_lock);
		if (free_space_ctl->free_space <
		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
		    ffe_ctl->empty_size) {
			ffe_ctl->total_free_space = max_t(u64,
					ffe_ctl->total_free_space,
					free_space_ctl->free_space);
			spin_unlock(&free_space_ctl->tree_lock);
			return 1;
		}
		spin_unlock(&free_space_ctl->tree_lock);
	}

	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
			ffe_ctl->num_bytes, ffe_ctl->empty_size,
			&ffe_ctl->max_extent_size);

	/*
	 * If we didn't find a chunk, and we haven't failed on this block group
	 * before, and this block group is in the middle of caching and we are
	 * ok with waiting, then go ahead and wait for progress to be made, and
	 * set @retry_unclustered to true.
	 *
	 * If @retry_unclustered is true then we've already waited on this
	 * block group once and should move on to the next block group.
	 */
	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
	    ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
						      ffe_ctl->empty_size);
		ffe_ctl->retry_unclustered = true;
		return -EAGAIN;
	} else if (!offset) {
		return 1;
	}
	ffe_ctl->found_offset = offset;
	return 0;
}

static int do_allocation_clustered(struct btrfs_block_group *block_group,
				   struct find_free_extent_ctl *ffe_ctl,
				   struct btrfs_block_group **bg_ret)
{
	int ret;

	/* We want to try and use the cluster allocator, so lets look there */
	if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) {
		ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret);
		if (ret >= 0 || ret == -EAGAIN)
			return ret;
		/* ret == -ENOENT case falls through */
	}

	return find_free_extent_unclustered(block_group, ffe_ctl);
}

/*
 * Tree-log block group locking
 * ============================
 *
 * fs_info::treelog_bg_lock protects the fs_info::treelog_bg which
 * indicates the starting address of a block group, which is reserved only
 * for tree-log metadata.
 *
 * Lock nesting
 * ============
 *
 * space_info::lock
 *   block_group::lock
 *     fs_info::treelog_bg_lock
 */

/*
 * Simple allocator for sequential-only block group. It only allows sequential
 * allocation. No need to play with trees. This function also reserves the
 * bytes as in btrfs_add_reserved_bytes.
 */
static int do_allocation_zoned(struct btrfs_block_group *block_group,
			       struct find_free_extent_ctl *ffe_ctl,
			       struct btrfs_block_group **bg_ret)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_space_info *space_info = block_group->space_info;
	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
	u64 start = block_group->start;
	u64 num_bytes = ffe_ctl->num_bytes;
	u64 avail;
	u64 bytenr = block_group->start;
	u64 log_bytenr;
	int ret = 0;
	bool skip;

	ASSERT(btrfs_is_zoned(block_group->fs_info));

	/*
	 * Do not allow non-tree-log blocks in the dedicated tree-log block
	 * group, and vice versa.
	 */
	spin_lock(&fs_info->treelog_bg_lock);
	log_bytenr = fs_info->treelog_bg;
	skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) ||
			      (!ffe_ctl->for_treelog && bytenr == log_bytenr));
	spin_unlock(&fs_info->treelog_bg_lock);
	if (skip)
		return 1;

	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	spin_lock(&fs_info->treelog_bg_lock);

	ASSERT(!ffe_ctl->for_treelog ||
	       block_group->start == fs_info->treelog_bg ||
	       fs_info->treelog_bg == 0);

	if (block_group->ro) {
		ret = 1;
		goto out;
	}

	/*
	 * Do not allow a block group that is currently in use to become the
	 * dedicated tree-log block group.
	 */
	if (ffe_ctl->for_treelog && !fs_info->treelog_bg &&
	    (block_group->used || block_group->reserved)) {
		ret = 1;
		goto out;
	}

	avail = block_group->length - block_group->alloc_offset;
	if (avail < num_bytes) {
		if (ffe_ctl->max_extent_size < avail) {
			/*
			 * With sequential allocator, free space is always
			 * contiguous
			 */
			ffe_ctl->max_extent_size = avail;
			ffe_ctl->total_free_space = avail;
		}
		ret = 1;
		goto out;
	}

	if (ffe_ctl->for_treelog && !fs_info->treelog_bg)
		fs_info->treelog_bg = block_group->start;

	ffe_ctl->found_offset = start + block_group->alloc_offset;
	block_group->alloc_offset += num_bytes;
	spin_lock(&ctl->tree_lock);
	ctl->free_space -= num_bytes;
	spin_unlock(&ctl->tree_lock);

	/*
	 * We do not check if found_offset is aligned to stripesize. The
	 * address is anyway rewritten when using zone append writing.
	 */

	ffe_ctl->search_start = ffe_ctl->found_offset;

out:
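	/*
	 * A failed tree-log allocation releases the dedicated tree-log
	 * block group.
	 */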
	if (ret && ffe_ctl->for_treelog)
		fs_info->treelog_bg = 0;
	spin_unlock(&fs_info->treelog_bg_lock);
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);
	return ret;
}

static int do_allocation(struct btrfs_block_group *block_group,
			 struct find_free_extent_ctl *ffe_ctl,
			 struct btrfs_block_group **bg_ret)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		return do_allocation_clustered(block_group, ffe_ctl, bg_ret);
	case BTRFS_EXTENT_ALLOC_ZONED:
		return do_allocation_zoned(block_group, ffe_ctl, bg_ret);
	default:
		BUG();
	}
}

static void release_block_group(struct btrfs_block_group *block_group,
				struct find_free_extent_ctl *ffe_ctl,
				int delalloc)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		ffe_ctl->retry_clustered = false;
		ffe_ctl->retry_unclustered = false;
		break;
	case BTRFS_EXTENT_ALLOC_ZONED:
		/* Nothing to do */
		break;
	default:
		BUG();
	}

	BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
	       ffe_ctl->index);
	btrfs_release_block_group(block_group, delalloc);
}

static void found_extent_clustered(struct find_free_extent_ctl *ffe_ctl,
				   struct btrfs_key *ins)
{
	struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;

	if (!ffe_ctl->use_cluster && last_ptr) {
		spin_lock(&last_ptr->lock);
		last_ptr->window_start = ins->objectid;
		spin_unlock(&last_ptr->lock);
	}
}

static void found_extent(struct find_free_extent_ctl *ffe_ctl,
			 struct btrfs_key *ins)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		found_extent_clustered(ffe_ctl, ins);
		break;
	case BTRFS_EXTENT_ALLOC_ZONED:
		/* Nothing to do */
		break;
	default:
		BUG();
	}
}

static int chunk_allocation_failed(struct find_free_extent_ctl *ffe_ctl)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		/*
		 * If we can't allocate a new chunk we've already looped through
		 * at least once, move on to the NO_EMPTY_SIZE case.
		 */
		ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
		return 0;
	case BTRFS_EXTENT_ALLOC_ZONED:
		/* Give up here */
		return -ENOSPC;
	default:
		BUG();
	}
}

/*
 * Return >0 means caller needs to re-search for a free extent.
 * Return 0 means we have the needed free extent.
 * Return <0 means we failed to locate any free extent.
 */
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
					struct btrfs_key *ins,
					struct find_free_extent_ctl *ffe_ctl,
					bool full_search)
{
	struct btrfs_root *root = fs_info->extent_root;
	int ret;

	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
		ffe_ctl->orig_have_caching_bg = true;

	if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
	    ffe_ctl->have_caching_bg)
		return 1;

	if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
		return 1;

	if (ins->objectid) {
		found_extent(ffe_ctl, ins);
		return 0;
	}

	/*
	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
	 *			caching kthreads as we move along
	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
	 *		       again
	 */
	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
		ffe_ctl->index = 0;
		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
			/*
			 * We want to skip the LOOP_CACHING_WAIT step if we
			 * don't have any uncached bgs and we've already done a
			 * full search through.
			 */
			if (ffe_ctl->orig_have_caching_bg || !full_search)
				ffe_ctl->loop = LOOP_CACHING_WAIT;
			else
				ffe_ctl->loop = LOOP_ALLOC_CHUNK;
		} else {
			ffe_ctl->loop++;
		}

		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
			struct btrfs_trans_handle *trans;
			int exist = 0;

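			/*
			 * Reuse the transaction this task is already attached
			 * to, otherwise join one just for the chunk allocation.
			 */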
			trans = current->journal_info;
			if (trans)
				exist = 1;
			else
				trans = btrfs_join_transaction(root);

			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				return ret;
			}

			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
						CHUNK_ALLOC_FORCE);

			/* Do not bail out on ENOSPC since we can do more. */
			if (ret == -ENOSPC)
				ret = chunk_allocation_failed(ffe_ctl);
			else if (ret < 0)
				btrfs_abort_transaction(trans, ret);
			else
				ret = 0;
			if (!exist)
				btrfs_end_transaction(trans);
			if (ret)
				return ret;
		}

		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
			if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED)
				return -ENOSPC;

			/*
			 * Don't loop again if we already have no empty_size and
			 * no empty_cluster.
			 */
			if (ffe_ctl->empty_size == 0 &&
			    ffe_ctl->empty_cluster == 0)
				return -ENOSPC;
			ffe_ctl->empty_size = 0;
			ffe_ctl->empty_cluster = 0;
		}
		return 1;
	}
	return -ENOSPC;
}

static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
					struct find_free_extent_ctl *ffe_ctl,
					struct btrfs_space_info *space_info,
					struct btrfs_key *ins)
{
	/*
	 * If our free space is heavily fragmented we may not be able to make
	 * big contiguous allocations, so instead of doing the expensive search
	 * for free space, simply return ENOSPC with our max_extent_size so we
	 * can go ahead and search for a more manageable chunk.
	 *
	 * If our max_extent_size is large enough for our allocation simply
	 * disable clustering since we will likely not be able to find enough
	 * space to create a cluster and induce latency trying.
	 */
	if (space_info->max_extent_size) {
		spin_lock(&space_info->lock);
		if (space_info->max_extent_size &&
		    ffe_ctl->num_bytes > space_info->max_extent_size) {
			ins->offset = space_info->max_extent_size;
			spin_unlock(&space_info->lock);
			return -ENOSPC;
		} else if (space_info->max_extent_size) {
			ffe_ctl->use_cluster = false;
		}
		spin_unlock(&space_info->lock);
	}

	ffe_ctl->last_ptr = fetch_cluster_info(fs_info, space_info,
					       &ffe_ctl->empty_cluster);
	if (ffe_ctl->last_ptr) {
		struct btrfs_free_cluster *last_ptr = ffe_ctl->last_ptr;

		spin_lock(&last_ptr->lock);
		if (last_ptr->block_group)
			ffe_ctl->hint_byte = last_ptr->window_start;
		if (last_ptr->fragmented) {
			/*
			 * We still set window_start so we can keep track of the
			 * last place we found an allocation to try and save
			 * some time.
			 */
			ffe_ctl->hint_byte = last_ptr->window_start;
			ffe_ctl->use_cluster = false;
		}
		spin_unlock(&last_ptr->lock);
	}

	return 0;
}

static int prepare_allocation(struct btrfs_fs_info *fs_info,
			      struct find_free_extent_ctl *ffe_ctl,
			      struct btrfs_space_info *space_info,
			      struct btrfs_key *ins)
{
	switch (ffe_ctl->policy) {
	case BTRFS_EXTENT_ALLOC_CLUSTERED:
		return prepare_allocation_clustered(fs_info, ffe_ctl,
						    space_info, ins);
	case BTRFS_EXTENT_ALLOC_ZONED:
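		/*
		 * If a dedicated tree-log block group exists, hint the search
		 * to start there.
		 */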
		if (ffe_ctl->for_treelog) {
			spin_lock(&fs_info->treelog_bg_lock);
			if (fs_info->treelog_bg)
				ffe_ctl->hint_byte = fs_info->treelog_bg;
			spin_unlock(&fs_info->treelog_bg_lock);
		}
		return 0;
	default:
		BUG();
	}
}

/*
 * walks the btree of allocated extents and finds a hole of a given size.
 * The key ins is changed to record the hole:
 * ins->objectid == start position
 * ins->flags = BTRFS_EXTENT_ITEM_KEY
 * ins->offset == the size of the hole.
 * Any available blocks before search_start are skipped.
 *
 * If there is no suitable free space, we record the size of the largest
 * free space extent found so far.
 *
 * The overall logic and call chain:
 *
 * find_free_extent()
 * |- Iterate through all block groups
 * |  |- Get a valid block group
 * |  |- Try to do clustered allocation in that block group
 * |  |- Try to do unclustered allocation in that block group
 * |  |- Check if the result is valid
 * |  |  |- If valid, then exit
 * |  |- Jump to next block group
 * |
 * |- Push harder to find free extents
 *    |- If not found, re-iterate all block groups
 */
static noinline int find_free_extent(struct btrfs_root *root,
				u64 ram_bytes, u64 num_bytes, u64 empty_size,
				u64 hint_byte_orig, struct btrfs_key *ins,
				u64 flags, int delalloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int cache_block_group_error = 0;
	struct btrfs_block_group *block_group = NULL;
	struct find_free_extent_ctl ffe_ctl = {0};
	struct btrfs_space_info *space_info;
	bool full_search = false;
	bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);

	WARN_ON(num_bytes < fs_info->sectorsize);

	ffe_ctl.num_bytes = num_bytes;
	ffe_ctl.empty_size = empty_size;
	ffe_ctl.flags = flags;
	ffe_ctl.search_start = 0;
	ffe_ctl.delalloc = delalloc;
	ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
	ffe_ctl.have_caching_bg = false;
	ffe_ctl.orig_have_caching_bg = false;
	ffe_ctl.found_offset = 0;
	ffe_ctl.hint_byte = hint_byte_orig;
	ffe_ctl.for_treelog = for_treelog;
	ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED;

	/* For clustered allocation */
	ffe_ctl.retry_clustered = false;
	ffe_ctl.retry_unclustered = false;
	ffe_ctl.last_ptr = NULL;
	ffe_ctl.use_cluster = true;

	if (btrfs_is_zoned(fs_info))
		ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED;

	ins->type = BTRFS_EXTENT_ITEM_KEY;
	ins->objectid = 0;
	ins->offset = 0;

	trace_find_free_extent(root, num_bytes, empty_size, flags);

	space_info = btrfs_find_space_info(fs_info, flags);
	if (!space_info) {
		btrfs_err(fs_info, "No space info for %llu", flags);
		return -ENOSPC;
	}

	ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins);
	if (ret < 0)
		return ret;

	ffe_ctl.search_start = max(ffe_ctl.search_start,
				   first_logical_byte(fs_info, 0));
	ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte);
	if (ffe_ctl.search_start == ffe_ctl.hint_byte) {
		block_group = btrfs_lookup_block_group(fs_info,
						       ffe_ctl.search_start);
		/*
		 * we don't want to use the block group if it doesn't match our
		 * allocation bits, or if it's not cached.
		 *
		 * However if we are re-searching with an ideal block group
		 * picked out then we don't care that the block group is cached.
		 */
		if (block_group && block_group_bits(block_group, flags) &&
		    block_group->cached != BTRFS_CACHE_NO) {
			down_read(&space_info->groups_sem);
			if (list_empty(&block_group->list) ||
			    block_group->ro) {
				/*
				 * someone is removing this block group,
				 * we can't jump into the have_block_group
				 * target because our list pointers are not
				 * valid
				 */
				btrfs_put_block_group(block_group);
				up_read(&space_info->groups_sem);
			} else {
				ffe_ctl.index = btrfs_bg_flags_to_raid_index(
						block_group->flags);
				btrfs_lock_block_group(block_group, delalloc);
				goto have_block_group;
			}
		} else if (block_group) {
			btrfs_put_block_group(block_group);
		}
	}
search:
	ffe_ctl.have_caching_bg = false;
	if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
	    ffe_ctl.index == 0)
		full_search = true;
	down_read(&space_info->groups_sem);
	list_for_each_entry(block_group,
			    &space_info->block_groups[ffe_ctl.index], list) {
		struct btrfs_block_group *bg_ret;

		/* If the block group is read-only, we can skip it entirely. */
		if (unlikely(block_group->ro)) {
			if (for_treelog)
				btrfs_clear_treelog_bg(block_group);
			continue;
		}

		btrfs_grab_block_group(block_group, delalloc);
		ffe_ctl.search_start = block_group->start;

		/*
		 * this can happen if we end up cycling through all the
		 * raid types, but we want to make sure we only allocate
		 * for the proper type.
		 */
		if (!block_group_bits(block_group, flags)) {
			u64 extra = BTRFS_BLOCK_GROUP_DUP |
				BTRFS_BLOCK_GROUP_RAID1_MASK |
				BTRFS_BLOCK_GROUP_RAID56_MASK |
				BTRFS_BLOCK_GROUP_RAID10;

			/*
			 * if they asked for extra copies and this block group
			 * doesn't provide them, bail.  This does allow us to
			 * fill raid0 from raid1.
			 */
			if ((flags & extra) && !(block_group->flags & extra))
				goto loop;

			/*
			 * This block group has different flags than we want.
			 * It's possible that we have MIXED_GROUP flag but no
			 * block group is mixed.  Just skip such block group.
			 */
			btrfs_release_block_group(block_group, delalloc);
			continue;
		}

have_block_group:
		ffe_ctl.cached = btrfs_block_group_done(block_group);
		if (unlikely(!ffe_ctl.cached)) {
			ffe_ctl.have_caching_bg = true;
			ret = btrfs_cache_block_group(block_group, 0);

			/*
			 * If we get ENOMEM here or something else we want to
			 * try other block groups, because it may not be fatal.
			 * However if we can't find anything else we need to
			 * save our return here so that we return the actual
			 * error that caused problems, not ENOSPC.
			 */
			if (ret < 0) {
				if (!cache_block_group_error)
					cache_block_group_error = ret;
				ret = 0;
				goto loop;
			}
			ret = 0;
		}

		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
			goto loop;

		bg_ret = NULL;
		ret = do_allocation(block_group, &ffe_ctl, &bg_ret);
		if (ret == 0) {
			if (bg_ret && bg_ret != block_group) {
				btrfs_release_block_group(block_group, delalloc);
				block_group = bg_ret;
			}
		} else if (ret == -EAGAIN) {
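			/*
			 * The block group asked us to re-search it, e.g. after
			 * waiting for caching progress.
			 */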
			goto have_block_group;
		} else if (ret > 0) {
			goto loop;
		}

		/* Checks */
		ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
					     fs_info->stripesize);

		/* move on to the next group */
		if (ffe_ctl.search_start + num_bytes >
		    block_group->start + block_group->length) {
			btrfs_add_free_space_unused(block_group,
					    ffe_ctl.found_offset, num_bytes);
			goto loop;
		}

		if (ffe_ctl.found_offset < ffe_ctl.search_start)
			btrfs_add_free_space_unused(block_group,
					ffe_ctl.found_offset,
					ffe_ctl.search_start - ffe_ctl.found_offset);

		ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
				num_bytes, delalloc);
		if (ret == -EAGAIN) {
			btrfs_add_free_space_unused(block_group,
					ffe_ctl.found_offset, num_bytes);
			goto loop;
		}
		btrfs_inc_block_group_reservations(block_group);

		/* we are all good, lets return */
		ins->objectid = ffe_ctl.search_start;
		ins->offset = num_bytes;

		trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
					   num_bytes);
		btrfs_release_block_group(block_group, delalloc);
		break;
loop:
		release_block_group(block_group, &ffe_ctl, delalloc);
		cond_resched();
	}
	up_read(&space_info->groups_sem);

	ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search);
	if (ret > 0)
		goto search;

	if (ret == -ENOSPC && !cache_block_group_error) {
		/*
		 * Use ffe_ctl->total_free_space as fallback if we can't find
		 * any contiguous hole.
		 */
		if (!ffe_ctl.max_extent_size)
			ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
		spin_lock(&space_info->lock);
		space_info->max_extent_size = ffe_ctl.max_extent_size;
		spin_unlock(&space_info->lock);
		ins->offset = ffe_ctl.max_extent_size;
	} else if (ret == -ENOSPC) {
		ret = cache_block_group_error;
	}
	return ret;
}

/*
 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
 *			  hole that is at least as big as @num_bytes.
 *
 * @root           -	The root that will contain this extent
 *
 * @ram_bytes      -	The amount of space in ram that @num_bytes take. This
 *			is used for accounting purposes. This value differs
 *			from @num_bytes only in the case of compressed extents.
 *
 * @num_bytes      -	Number of bytes to allocate on-disk.
 *
 * @min_alloc_size -	Indicates the minimum amount of space that the
 *			allocator should try to satisfy. In some cases
 *			@num_bytes may be larger than what is required and if
 *			the filesystem is fragmented then allocation fails.
 *			However, the presence of @min_alloc_size gives a
 *			chance to try and satisfy the smaller allocation.
 *
 * @empty_size     -	A hint that you plan on doing more COW. This is the
 *			size in bytes the allocator should try to find free
 *			next to the block it returns.  This is just a hint and
 *			may be ignored by the allocator.
 *
 * @hint_byte      -	Hint to the allocator to start searching above the byte
 *			address passed. It might be ignored.
 *
 * @ins            -	This key is modified to record the found hole. It will
 *			have the following values:
 *			ins->objectid == start position
 *			ins->flags = BTRFS_EXTENT_ITEM_KEY
 *			ins->offset == the size of the hole.
 *
 * @is_data        -	Boolean flag indicating whether an extent is
 *			allocated for data (true) or metadata (false)
 *
 * @delalloc       -	Boolean flag indicating whether this allocation is for
 *			delalloc or not. If 'true' data_rwsem of block groups
 *			is going to be acquired.
 *
 *
 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
 * case -ENOSPC is returned then @ins->offset will contain the size of the
 * largest available hole the allocator managed to find.
 */
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
			 u64 num_bytes, u64 min_alloc_size,
			 u64 empty_size, u64 hint_byte,
			 struct btrfs_key *ins, int is_data, int delalloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	bool final_tried = num_bytes == min_alloc_size;
	u64 flags;
	int ret;
	bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);

	flags = get_alloc_profile_by_root(root, is_data);
again:
	WARN_ON(num_bytes < fs_info->sectorsize);
	ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
			       hint_byte, ins, flags, delalloc);
	if (!ret && !is_data) {
		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
	} else if (ret == -ENOSPC) {
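		/*
		 * Halve the requested size (bounded by the largest hole
		 * found) and retry until we are down to min_alloc_size.
		 */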
		if (!final_tried && ins->offset) {
			num_bytes = min(num_bytes >> 1, ins->offset);
			num_bytes = round_down(num_bytes,
					       fs_info->sectorsize);
			num_bytes = max(num_bytes, min_alloc_size);
			ram_bytes = num_bytes;
			if (num_bytes == min_alloc_size)
				final_tried = true;
			goto again;
		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
			struct btrfs_space_info *sinfo;

			sinfo = btrfs_find_space_info(fs_info, flags);
			btrfs_err(fs_info,
			"allocation failed flags %llu, wanted %llu tree-log %d",
				  flags, num_bytes, for_treelog);
			if (sinfo)
				btrfs_dump_space_info(fs_info, sinfo,
						      num_bytes, 1);
		}
	}

	return ret;
}

int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
			       u64 start, u64 len, int delalloc)
{
	struct btrfs_block_group *cache;

	cache = btrfs_lookup_block_group(fs_info, start);
	if (!cache) {
		btrfs_err(fs_info, "Unable to find block group for %llu",
			  start);
		return -ENOSPC;
	}

	btrfs_add_free_space(cache, start, len);
	btrfs_free_reserved_bytes(cache, len, delalloc);
	trace_btrfs_reserved_extent_free(fs_info, start, len);

	btrfs_put_block_group(cache);
	return 0;
}

int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start,
			      u64 len)
{
	struct btrfs_block_group *cache;
	int ret = 0;

	cache = btrfs_lookup_block_group(trans->fs_info, start);
	if (!cache) {
		btrfs_err(trans->fs_info, "unable to find block group for %llu",
			  start);
		return -ENOSPC;
	}

	ret = pin_down_extent(trans, cache, start, len, 1);
	btrfs_put_block_group(cache);
	return ret;
}

static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int type;
	u32 size;

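	/* A non-zero parent means a shared data ref keyed by the parent block. */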
	if (parent > 0)
		type = BTRFS_SHARED_DATA_REF_KEY;
	else
		type = BTRFS_EXTENT_DATA_REF_KEY;

	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      ins, size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_DATA);

	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (parent > 0) {
		struct btrfs_shared_data_ref *ref;
		ref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
	} else {
		struct btrfs_extent_data_ref *ref;
		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
	}

	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_free_path(path);

	ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
	if (ret)
		return ret;

	ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
	if (ret) { /* -ENOENT, logic error */
		btrfs_err(fs_info, "update block group failed for %llu %llu",
			ins->objectid, ins->offset);
		BUG();
	}
	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
	return ret;
}

static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_key extent_key;
	struct btrfs_tree_block_info *block_info;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_delayed_tree_ref *ref;
	u32 size = sizeof(*extent_item) + sizeof(*iref);
	u64 num_bytes;
	u64 flags = extent_op->flags_to_set;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

	ref = btrfs_delayed_node_to_tree_ref(node);

	extent_key.objectid = node->bytenr;
	if (skinny_metadata) {
		extent_key.offset = ref->level;
		extent_key.type = BTRFS_METADATA_ITEM_KEY;
		num_bytes = fs_info->nodesize;
	} else {
		extent_key.offset = node->num_bytes;
		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
		size += sizeof(*block_info);
		num_bytes = node->num_bytes;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      &extent_key, size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, 1);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);

	if (skinny_metadata) {
		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	} else {
		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
		btrfs_set_tree_block_level(leaf, block_info, ref->level);
		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
	}

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_SHARED_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
	} else {
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_TREE_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
	}

	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	ret = remove_from_free_space_tree(trans, extent_key.objectid,
					  num_bytes);
	if (ret)
		return ret;

	ret = btrfs_update_block_group(trans, extent_key.objectid,
				       fs_info->nodesize, 1);
	if (ret) { /* -ENOENT, logic error */
		btrfs_err(fs_info, "update block group failed for %llu %llu",
			extent_key.objectid, extent_key.offset);
		BUG();
	}

	trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
					  fs_info->nodesize);
	return ret;
}

int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root, u64 owner,
				     u64 offset, u64 ram_bytes,
				     struct btrfs_key *ins)
{
	struct btrfs_ref generic_ref = { 0 };

	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);

	btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
			       ins->objectid, ins->offset, 0);
	btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
	btrfs_ref_tree_mod(root->fs_info, &generic_ref);

	return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes);
}

/*
 * this is used by the tree logging recovery code.  It records that
 * an extent has been allocated and makes sure to clear the free
 * space cache bits as well
 */
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
				   u64 root_objectid, u64 owner, u64 offset,
				   struct btrfs_key *ins)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;

	/*
	 * Mixed block groups will exclude before processing the log so we only
	 * need to do the exclude dance if this fs isn't mixed.
	 */
	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = __exclude_logged_extent(fs_info, ins->objectid,
					      ins->offset);
		if (ret)
			return ret;
	}

	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
	if (!block_group)
		return -EINVAL;

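	/*
	 * Account the logged extent as reserved space in both the space_info
	 * and the block group.
	 */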
	space_info = block_group->space_info;
	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	space_info->bytes_reserved += ins->offset;
	block_group->reserved += ins->offset;
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);

	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
					 offset, ins, 1);
	if (ret)
		btrfs_pin_extent(trans, ins->objectid, ins->offset, 1);
	btrfs_put_block_group(block_group);
	return ret;
}

static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      u64 bytenr, int level, u64 owner,
		      enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *buf;

	buf = btrfs_find_create_tree_block(fs_info, bytenr, owner, level);
	if (IS_ERR(buf))
		return buf;

	/*
	 * Extra safety check in case the extent tree is corrupted and extent
	 * allocator chooses to use a tree block which is already used and
	 * locked.
	 */
	if (buf->lock_owner == current->pid) {
		btrfs_err_rl(fs_info,
"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
			buf->start, btrfs_header_owner(buf), current->pid);
		free_extent_buffer(buf);
		return ERR_PTR(-EUCLEAN);
	}

	/*
	 * This needs to stay, because we could allocate a freed block from an
	 * old tree into a new tree, so we need to make sure this new block is
	 * set to the appropriate level and owner.
	 */
	btrfs_set_buffer_lockdep_class(owner, buf, level);
	__btrfs_tree_lock(buf, nest);
	btrfs_clean_tree_block(buf);
	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
	clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);

	set_extent_buffer_uptodate(buf);

	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(buf, level);
	btrfs_set_header_bytenr(buf, buf->start);
	btrfs_set_header_generation(buf, trans->transid);
	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(buf, owner);
	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
		buf->log_index = root->log_transid % 2;
		/*
		 * we allow two log transactions at a time, use different
		 * EXTENT bits to differentiate dirty pages.
		 */
		if (buf->log_index == 0)
			set_extent_dirty(&root->dirty_log_pages, buf->start,
					buf->start + buf->len - 1, GFP_NOFS);
		else
			set_extent_new(&root->dirty_log_pages, buf->start,
					buf->start + buf->len - 1);
	} else {
		buf->log_index = -1;
		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
			 buf->start + buf->len - 1, GFP_NOFS);
	}
	trans->dirty = true;
	/* this returns a buffer locked for blocking */
	return buf;
}

/*
 * finds a free extent and does all the dirty work required for allocation
 * returns the tree buffer or an ERR_PTR on error.
 */
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     u64 parent, u64 root_objectid,
					     const struct btrfs_disk_key *key,
					     int level, u64 hint,
					     u64 empty_size,
					     enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key ins;
	struct btrfs_block_rsv *block_rsv;
	struct extent_buffer *buf;
	struct btrfs_delayed_extent_op *extent_op;
	struct btrfs_ref generic_ref = { 0 };
	u64 flags = 0;
	int ret;
	u32 blocksize = fs_info->nodesize;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
	if (btrfs_is_testing(fs_info)) {
		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
					    level, root_objectid, nest);
		if (!IS_ERR(buf))
			root->alloc_bytenr += blocksize;
		return buf;
	}
#endif

	block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
	if (IS_ERR(block_rsv))
		return ERR_CAST(block_rsv);

	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
				   empty_size, hint, &ins, 0, 0);
	if (ret)
		goto out_unuse;

	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
				    root_objectid, nest);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_free_reserved;
	}

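	/* Blocks allocated for the relocation tree always carry full backrefs. */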
	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent == 0)
			parent = ins.objectid;
		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
	} else
		BUG_ON(parent > 0);

	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
		extent_op = btrfs_alloc_delayed_extent_op();
		if (!extent_op) {
			ret = -ENOMEM;
			goto out_free_buf;
		}
		if (key)
			memcpy(&extent_op->key, key, sizeof(extent_op->key));
		else
			memset(&extent_op->key, 0, sizeof(extent_op->key));
		extent_op->flags_to_set = flags;
		extent_op->update_key = skinny_metadata ? false : true;
		extent_op->update_flags = true;
		extent_op->is_data = false;
		extent_op->level = level;

		btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
				       ins.objectid, ins.offset, parent);
		generic_ref.real_root = root->root_key.objectid;
		btrfs_init_tree_ref(&generic_ref, level, root_objectid);
		btrfs_ref_tree_mod(fs_info, &generic_ref);
		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op);
		if (ret)
			goto out_free_delayed;
	}
	return buf;

out_free_delayed:
	btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
	free_extent_buffer(buf);
out_free_reserved:
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
out_unuse:
	btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
	return ERR_PTR(ret);
}

struct walk_control {
	u64 refs[BTRFS_MAX_LEVEL];
	u64 flags[BTRFS_MAX_LEVEL];
	struct btrfs_key update_progress;
	struct btrfs_key drop_progress;
	int drop_level;
	int stage;
	int level;
	int shared_level;
	int update_ref;
	int keep_locks;
	int reada_slot;
	int reada_count;
	int restarted;
};

#define DROP_REFERENCE	1
#define UPDATE_BACKREF	2

static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct walk_control *wc,
				     struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 refs;
	u64 flags;
	u32 nritems;
	struct btrfs_key key;
	struct extent_buffer *eb;
	int ret;
	int slot;
	int nread = 0;

	if (path->slots[wc->level] < wc->reada_slot) {
		wc->reada_count = wc->reada_count * 2 / 3;
		wc->reada_count = max(wc->reada_count, 2);
	} else {
		wc->reada_count = wc->reada_count * 3 / 2;
		wc->reada_count = min_t(int, wc->reada_count,
					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
	}

	eb = path->nodes[wc->level];
	nritems = btrfs_header_nritems(eb);

	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
		if (nread >= wc->reada_count)
			break;

		cond_resched();
		bytenr = btrfs_node_blockptr(eb, slot);
		generation = btrfs_node_ptr_generation(eb, slot);

		if (slot == path->slots[wc->level])
			goto reada;

		if (wc->stage == UPDATE_BACKREF &&
		    generation <= root->root_key.offset)
			continue;

		/* We don't lock the tree block, it's OK to be racy here */
		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
					       wc->level - 1, 1, &refs,
					       &flags);
		/* We don't care about errors in readahead. */
		if (ret < 0)
			continue;
		BUG_ON(refs == 0);

		if (wc->stage == DROP_REFERENCE) {
			if (refs == 1)
				goto reada;

			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				continue;
			btrfs_node_key_to_cpu(eb, &key, slot);
			ret = btrfs_comp_cpu_keys(&key,
						  &wc->update_progress);
			if (ret < 0)
				continue;
		} else {
			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
		}
reada:
		btrfs_readahead_node_child(eb, slot);
		nread++;
	}
	wc->reada_slot = slot;
}

/*
 * helper to process tree block while walking down the tree.
 *
 * when wc->stage == UPDATE_BACKREF, this function updates
 * back refs for pointers in the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc, int lookup_info)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	int ret;

	if (wc->stage == UPDATE_BACKREF &&
	    btrfs_header_owner(eb) != root->root_key.objectid)
		return 1;

	/*
	 * when reference count of tree block is 1, it won't increase
	 * again. once full backref flag is set, we never clear it.
	 */
	if (lookup_info &&
	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
		BUG_ON(!path->locks[level]);
		ret = btrfs_lookup_extent_info(trans, fs_info,
					       eb->start, level, 1,
					       &wc->refs[level],
					       &wc->flags[level]);
		BUG_ON(ret == -ENOMEM);
		if (ret)
			return ret;
		BUG_ON(wc->refs[level] == 0);
	}

	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level] > 1)
			return 1;

		if (path->locks[level] && !wc->keep_locks) {
			btrfs_tree_unlock_rw(eb, path->locks[level]);
			path->locks[level] = 0;
		}
		return 0;
	}

	/* wc->stage == UPDATE_BACKREF */
	if (!(wc->flags[level] & flag)) {
		BUG_ON(!path->locks[level]);
		ret = btrfs_inc_ref(trans, root, eb, 1);
		BUG_ON(ret); /* -ENOMEM */
		ret = btrfs_dec_ref(trans, root, eb, 0);
		BUG_ON(ret); /* -ENOMEM */
		ret = btrfs_set_disk_extent_flags(trans, eb, flag,
						  btrfs_header_level(eb), 0);
		BUG_ON(ret); /* -ENOMEM */
		wc->flags[level] |= flag;
	}

	/*
	 * the block is shared by multiple trees, so it's not good to
	 * keep the tree lock
	 */
	if (path->locks[level] && level > 0) {
		btrfs_tree_unlock_rw(eb, path->locks[level]);
		path->locks[level] = 0;
	}
	return 0;
}

/*
 * This is used to verify a ref exists for this root to deal with a bug where we
 * would have a drop_progress key that hadn't been updated properly.
 */
static int check_ref_exists(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, u64 bytenr, u64 parent,
			    int level)
{
	struct btrfs_path *path;
	struct btrfs_extent_inline_ref *iref;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = lookup_extent_backref(trans, path, &iref, bytenr,
				    root->fs_info->nodesize, parent,
				    root->root_key.objectid, level, 0);
	btrfs_free_path(path);
	if (ret == -ENOENT)
		return 0;
	if (ret < 0)
		return ret;
	return 1;
}

/*
 * helper to process tree block pointer.
 *
 * when wc->stage == DROP_REFERENCE, this function checks
 * reference count of the block pointed to. if the block
 * is shared and we need update back refs for the subtree
 * rooted at the block, this function changes wc->stage to
 * UPDATE_BACKREF. if the block is shared and there is no
 * need to update back, this function drops the reference
 * to the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int *lookup_info)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 parent;
	struct btrfs_key key;
	struct btrfs_key first_key;
	struct btrfs_ref ref = { 0 };
	struct extent_buffer *next;
	int level = wc->level;
	int reada = 0;
	int ret = 0;
	bool need_account = false;

	generation = btrfs_node_ptr_generation(path->nodes[level],
					       path->slots[level]);
	/*
	 * if the lower level block was created before the snapshot
	 * was created, we know there is no need to update back refs
	 * for the subtree
	 */
	if (wc->stage == UPDATE_BACKREF &&
	    generation <= root->root_key.offset) {
		*lookup_info = 1;
		return 1;
	}

	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
	btrfs_node_key_to_cpu(path->nodes[level], &first_key,
			      path->slots[level]);

	next = find_extent_buffer(fs_info, bytenr);
	if (!next) {
		next = btrfs_find_create_tree_block(fs_info, bytenr,
				root->root_key.objectid, level - 1);
		if (IS_ERR(next))
			return PTR_ERR(next);
		reada = 1;
	}
	btrfs_tree_lock(next);

	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
				       &wc->refs[level - 1],
				       &wc->flags[level - 1]);
	if (ret < 0)
		goto out_unlock;

	if (unlikely(wc->refs[level - 1] == 0)) {
		btrfs_err(fs_info, "Missing references.");
		ret = -EIO;
		goto out_unlock;
	}
	*lookup_info = 0;

	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level - 1] > 1) {
			need_account = true;
			if (level == 1 &&
			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				goto skip;

			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				goto skip;

			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
			if (ret < 0)
				goto skip;

			wc->stage = UPDATE_BACKREF;
			wc->shared_level = level - 1;
		}
	} else {
		if (level == 1 &&
		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
			goto skip;
	}

	if (!btrfs_buffer_uptodate(next, generation, 0)) {
		btrfs_tree_unlock(next);
		free_extent_buffer(next);
		next = NULL;
		*lookup_info = 1;
	}

	if (!next) {
		if (reada && level == 1)
			reada_walk_down(trans, root, wc, path);
		next = read_tree_block(fs_info, bytenr, root->root_key.objectid,
				       generation, level - 1, &first_key);
		if (IS_ERR(next)) {
			return PTR_ERR(next);
		} else if (!extent_buffer_uptodate(next)) {
			free_extent_buffer(next);
			return -EIO;
		}
		btrfs_tree_lock(next);
	}

	level--;
	ASSERT(level == btrfs_header_level(next));
	if (level != btrfs_header_level(next)) {
		btrfs_err(root->fs_info, "mismatched level");
		ret = -EIO;
		goto out_unlock;
	}
	path->nodes[level] = next;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK;
	wc->level = level;
	if (wc->level == 1)
		wc->reada_slot = 0;
	return 0;
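	/*
	 * skip: we are not descending into this child block.  In the
	 * DROP_REFERENCE stage the reference this node holds on the child
	 * is dropped below instead.
	 */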
skip:
	wc->refs[level - 1] = 0;
	wc->flags[level - 1] = 0;
	if (wc->stage == DROP_REFERENCE) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			parent = path->nodes[level]->start;
		} else {
			ASSERT(root->root_key.objectid ==
			       btrfs_header_owner(path->nodes[level]));
			if (root->root_key.objectid !=
			    btrfs_header_owner(path->nodes[level])) {
				btrfs_err(root->fs_info,
						"mismatched block owner");
				ret = -EIO;
				goto out_unlock;
			}
			parent = 0;
		}

		/*
		 * If we had a drop_progress we need to verify the refs are set
		 * as expected.  If we find our ref then we know that from here
		 * on out everything should be correct, and we can clear the
		 * ->restarted flag.
		 */
		if (wc->restarted) {
			ret = check_ref_exists(trans, root, bytenr, parent,
					       level - 1);
			if (ret < 0)
				goto out_unlock;
			if (ret == 0)
				goto no_delete;
			ret = 0;
			wc->restarted = 0;
		}

		/*
		 * Reloc tree doesn't contribute to qgroup numbers, and we have
		 * already accounted them at merge time (replace_path),
		 * thus we could skip expensive subtree trace here.
		 */
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
		    need_account) {
			ret = btrfs_qgroup_trace_subtree(trans, next,
							 generation, level - 1);
			if (ret) {
				btrfs_err_rl(fs_info,
					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
					     ret);
			}
		}

		/*
		 * We need to update the next key in our walk control so we can
		 * update the drop_progress key accordingly.  We don't care if
		 * find_next_key doesn't find a key because that means we're at
		 * the end and are going to clean up now.
		 */
		wc->drop_level = level;
		find_next_key(path, level, &wc->drop_progress);

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
				       fs_info->nodesize, parent);
		btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
		ret = btrfs_free_extent(trans, &ref);
		if (ret)
			goto out_unlock;
	}
no_delete:
	*lookup_info = 1;
	ret = 1;

out_unlock:
	btrfs_tree_unlock(next);
	free_extent_buffer(next);

	return ret;
}

/*
 * helper to process tree block while walking up the tree.
 *
 * when wc->stage == DROP_REFERENCE, this function drops the
 * reference count on the block.
 *
 * when wc->stage == UPDATE_BACKREF, this function changes
 * wc->stage back to DROP_REFERENCE if we changed wc->stage
 * to UPDATE_BACKREF previously while processing the block.
 *
 * NOTE: return value 1 means we should stop walking up.
 */
static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 parent = 0;

	if (wc->stage == UPDATE_BACKREF) {
		BUG_ON(wc->shared_level < level);
		if (level < wc->shared_level)
			goto out;

		ret = find_next_key(path, level + 1, &wc->update_progress);
		if (ret > 0)
			wc->update_ref = 0;

		wc->stage = DROP_REFERENCE;
		wc->shared_level = -1;
		path->slots[level] = 0;

		/*
		 * check reference count again if the block isn't locked.
		 * we should start walking down the tree again if reference
		 * count is one.
		 */
		if (!path->locks[level]) {
			BUG_ON(level == 0);
			btrfs_tree_lock(eb);
			path->locks[level] = BTRFS_WRITE_LOCK;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						       eb->start, level, 1,
						       &wc->refs[level],
						       &wc->flags[level]);
			if (ret < 0) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return ret;
			}
			BUG_ON(wc->refs[level] == 0);
			if (wc->refs[level] == 1) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return 1;
			}
		}
	}

	/* wc->stage == DROP_REFERENCE */
	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);

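	/*
	 * This was the last reference: for a leaf, drop the refs it holds on
	 * its file extents (and, for fs trees, let qgroups account the leaf
	 * items), then clean the block before it is freed below.
	 */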
	if (wc->refs[level] == 1) {
		if (level == 0) {
			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				ret = btrfs_dec_ref(trans, root, eb, 1);
			else
				ret = btrfs_dec_ref(trans, root, eb, 0);
			BUG_ON(ret); /* -ENOMEM */
			if (is_fstree(root->root_key.objectid)) {
				ret = btrfs_qgroup_trace_leaf_items(trans, eb);
				if (ret) {
					btrfs_err_rl(fs_info,
	"error %d accounting leaf items, quota is out of sync, rescan required",
					     ret);
				}
			}
		}
		/* make block locked assertion in btrfs_clean_tree_block happy */
		if (!path->locks[level] &&
		    btrfs_header_generation(eb) == trans->transid) {
			btrfs_tree_lock(eb);
			path->locks[level] = BTRFS_WRITE_LOCK;
		}
		btrfs_clean_tree_block(eb);
	}

	if (eb == root->node) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = eb->start;
		else if (root->root_key.objectid != btrfs_header_owner(eb))
			goto owner_mismatch;
	} else {
		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = path->nodes[level + 1]->start;
		else if (root->root_key.objectid !=
			 btrfs_header_owner(path->nodes[level + 1]))
			goto owner_mismatch;
	}

	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
out:
	wc->refs[level] = 0;
	wc->flags[level] = 0;
	return 0;

owner_mismatch:
	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
		     btrfs_header_owner(eb), root->root_key.objectid);
	return -EUCLEAN;
}

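/*
 * Walk down the tree starting at wc->level: walk_down_proc() handles the
 * block at the current level and do_walk_down() descends into the child
 * pointed to by the current slot.  Stop once we reach a leaf, exhaust the
 * current node, or a callee tells us to stop.
 */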
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc)
{
	int level = wc->level;
	int lookup_info = 1;
	int ret;

	while (level >= 0) {
		ret = walk_down_proc(trans, root, path, wc, lookup_info);
		if (ret > 0)
			break;

		if (level == 0)
			break;

		if (path->slots[level] >=
		    btrfs_header_nritems(path->nodes[level]))
			break;

		ret = do_walk_down(trans, root, path, wc, &lookup_info);
		if (ret > 0) {
			path->slots[level]++;
			continue;
		} else if (ret < 0)
			return ret;
		level = wc->level;
	}
	return 0;
}

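/*
 * Walk back up the tree as far as max_level.  If the node at the current
 * level still has unvisited child slots, advance the slot and return 0 so
 * the caller can walk back down; otherwise process the block with
 * walk_up_proc(), release it and move one level up.  Returns 1 once every
 * level below max_level has been processed.
 */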
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int max_level)
{
	int level = wc->level;
	int ret;

	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
	while (level < max_level && path->nodes[level]) {
		wc->level = level;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			path->slots[level]++;
			return 0;
		} else {
			ret = walk_up_proc(trans, root, path, wc);
			if (ret > 0)
				return 0;
			if (ret < 0)
				return ret;

			if (path->locks[level]) {
				btrfs_tree_unlock_rw(path->nodes[level],
						     path->locks[level]);
				path->locks[level] = 0;
			}
			free_extent_buffer(path->nodes[level]);
			path->nodes[level] = NULL;
			level++;
		}
	}
	return 1;
}

/*
 * drop a subvolume tree.
 *
 * this function traverses the tree freeing any blocks that are only
 * referenced by the tree.
 *
 * when a shared tree block is found, this function decreases its
 * reference count by one. if update_ref is true, this function
 * also makes sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 *
 * If called with for_reloc == 0, may exit early with -EAGAIN
 */
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root_item *root_item = &root->root_item;
	struct walk_control *wc;
	struct btrfs_key key;
	int err = 0;
	int ret;
	int level;
	bool root_dropped = false;

	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		err = -ENOMEM;
		goto out;
	}

	/*
	 * Use join to avoid potential EINTR from transaction start. See
	 * wait_reserve_ticket and the whole reservation callchain.
	 */
	if (for_reloc)
		trans = btrfs_join_transaction(tree_root);
	else
		trans = btrfs_start_transaction(tree_root, 0);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}

	err = btrfs_run_delayed_items(trans);
	if (err)
		goto out_end_trans;

	/*
	 * This will help us catch people modifying the fs tree while we're
	 * dropping it.  It is unsafe to mess with the fs tree while it's being
	 * dropped as we unlock the root node and parent nodes as we walk down
	 * the tree, assuming nothing will change.  If something does change
	 * then we'll have stale information and drop references to blocks we've
	 * already dropped.
	 */
	set_bit(BTRFS_ROOT_DELETING, &root->state);
	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_header_level(root->node);
		path->nodes[level] = btrfs_lock_root_node(root);
		path->slots[level] = 0;
		path->locks[level] = BTRFS_WRITE_LOCK;
		memset(&wc->update_progress, 0,
		       sizeof(wc->update_progress));
	} else {
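		/*
		 * A previous, interrupted drop recorded where it stopped in
		 * drop_progress; re-search to that key and resume from there.
		 */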
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
		memcpy(&wc->update_progress, &key,
		       sizeof(wc->update_progress));

		level = btrfs_root_drop_level(root_item);
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			err = ret;
			goto out_end_trans;
		}
		WARN_ON(ret > 0);

		/*
		 * unlock our path, this is safe because only this
		 * function is allowed to delete this snapshot
		 */
		btrfs_unlock_up_safe(path, 0);

		level = btrfs_header_level(root->node);
		while (1) {
			btrfs_tree_lock(path->nodes[level]);
			path->locks[level] = BTRFS_WRITE_LOCK;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						path->nodes[level]->start,
						level, 1, &wc->refs[level],
						&wc->flags[level]);
			if (ret < 0) {
				err = ret;
				goto out_end_trans;
			}
			BUG_ON(wc->refs[level] == 0);

			if (level == btrfs_root_drop_level(root_item))
				break;

			btrfs_tree_unlock(path->nodes[level]);
			path->locks[level] = 0;
			WARN_ON(wc->refs[level] != 1);
			level--;
		}
	}

	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = update_ref;
	wc->keep_locks = 0;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

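	/*
	 * Main drop loop: repeatedly walk down and back up the tree, freeing
	 * blocks as we go, and periodically record drop_progress in the root
	 * item so an interrupted drop can resume where it left off.
	 */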
	while (1) {

		ret = walk_down_tree(trans, root, path, wc);
		if (ret < 0) {
			err = ret;
			break;
		}

		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
		if (ret < 0) {
			err = ret;
			break;
		}

		if (ret > 0) {
			BUG_ON(wc->stage != DROP_REFERENCE);
			break;
		}

		if (wc->stage == DROP_REFERENCE) {
			wc->drop_level = wc->level;
			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
					      &wc->drop_progress,
					      path->slots[wc->drop_level]);
		}
		btrfs_cpu_key_to_disk(&root_item->drop_progress,
				      &wc->drop_progress);
		btrfs_set_root_drop_level(root_item, wc->drop_level);

		BUG_ON(wc->level == 0);
		if (btrfs_should_end_transaction(trans) ||
		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
			ret = btrfs_update_root(trans, tree_root,
						&root->root_key,
						root_item);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				err = ret;
				goto out_end_trans;
			}

			btrfs_end_transaction_throttle(trans);
			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
				btrfs_debug(fs_info,
					    "drop snapshot early exit");
				err = -EAGAIN;
				goto out_free;
			}

		       /*
			* Use join to avoid potential EINTR from transaction
			* start. See wait_reserve_ticket and the whole
			* reservation callchain.
			*/
			if (for_reloc)
				trans = btrfs_join_transaction(tree_root);
			else
				trans = btrfs_start_transaction(tree_root, 0);
			if (IS_ERR(trans)) {
				err = PTR_ERR(trans);
				goto out_free;
			}
		}
	}
	btrfs_release_path(path);
	if (err)
		goto out_end_trans;

	ret = btrfs_del_root(trans, &root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		err = ret;
		goto out_end_trans;
	}

	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_find_root(tree_root, &root->root_key, path,
				      NULL, NULL);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			err = ret;
			goto out_end_trans;
		} else if (ret > 0) {
			/* if we fail to delete the orphan item this time
			 * around, it'll get picked up the next time.
			 *
			 * The most common failure here is just -ENOENT.
			 */
			btrfs_del_orphan_item(trans, tree_root,
					      root->root_key.objectid);
		}
	}

	/*
	 * This subvolume is going to be completely dropped, and won't be
	 * recorded as dirty roots, thus pertrans meta rsv will not be freed at
	 * commit transaction time.  So free it here manually.
	 */
	btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
	btrfs_qgroup_free_meta_all_pertrans(root);

	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
		btrfs_add_dropped_root(trans, root);
	else
		btrfs_put_root(root);
	root_dropped = true;
out_end_trans:
	btrfs_end_transaction_throttle(trans);
out_free:
	kfree(wc);
	btrfs_free_path(path);
out:
	/*
	 * So if we need to stop dropping the snapshot for whatever reason we
	 * need to make sure to add it back to the dead root list so that we
	 * keep trying to do the work later.  This also cleans up roots if we
	 * don't have it in the radix (like when we recover after a power fail
	 * or unmount) so we don't leak memory.
	 */
	if (!for_reloc && !root_dropped)
		btrfs_add_dead_root(root);
	return err;
}

/*
 * drop subtree rooted at tree block 'node'.
 *
 * NOTE: this function will unlock and release tree block 'node'
 * only used by relocation code
 */
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct extent_buffer *node,
			struct extent_buffer *parent)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct walk_control *wc;
	int level;
	int parent_level;
	int ret = 0;
	int wret;

	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

	btrfs_assert_tree_locked(parent);
	parent_level = btrfs_header_level(parent);
	atomic_inc(&parent->refs);
	path->nodes[parent_level] = parent;
	path->slots[parent_level] = btrfs_header_nritems(parent);

	btrfs_assert_tree_locked(node);
	level = btrfs_header_level(node);
	path->nodes[level] = node;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK;

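	/*
	 * Seed the walk control as if 'parent' had already been processed:
	 * a single reference with the FULL_BACKREF flag set, so everything
	 * below 'node' is dropped relative to 'parent'.
	 */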
	wc->refs[parent_level] = 1;
	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = 0;
	wc->keep_locks = 1;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {
		wret = walk_down_tree(trans, root, path, wc);
		if (wret < 0) {
			ret = wret;
			break;
		}

		wret = walk_up_tree(trans, root, path, wc, parent_level);
		if (wret < 0)
			ret = wret;
		if (wret != 0)
			break;
	}

	kfree(wc);
	btrfs_free_path(path);
	return ret;
}

/*
 * helper to account the unused space of all the readonly block groups in the
 * space_info. takes mirrors into account.
 */
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
	struct btrfs_block_group *block_group;
	u64 free_bytes = 0;
	int factor;

	/* It's df, we don't care if it's racy */
	if (list_empty(&sinfo->ro_bgs))
		return 0;

	spin_lock(&sinfo->lock);
	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
		spin_lock(&block_group->lock);

		if (!block_group->ro) {
			spin_unlock(&block_group->lock);
			continue;
		}

		factor = btrfs_bg_type_to_factor(block_group->flags);
		free_bytes += (block_group->length -
			       block_group->used) * factor;

		spin_unlock(&block_group->lock);
	}
	spin_unlock(&sinfo->lock);

	return free_bytes;
}

int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
				   u64 start, u64 end)
{
	return unpin_extent_range(fs_info, start, end, false);
}

/*
 * It used to be that old block groups would be left around forever.
 * Iterating over them would be enough to trim unused space.  Since we
 * now automatically remove them, we also need to iterate over unallocated
 * space.
 *
 * We don't want a transaction for this since the discard may take a
 * substantial amount of time.  We don't require that a transaction be
 * running, but we do need to take a running transaction into account
 * to ensure that we're not discarding chunks that were released or
 * allocated in the current transaction.
 *
 * Holding the chunks lock will prevent other threads from allocating
 * or releasing chunks, but it won't prevent a running transaction
 * from committing and releasing the memory that the pending chunks
 * list head uses.  For that, we need to take a reference to the
 * transaction and hold the commit root sem.  We only need to hold
 * it while performing the free space search since we have already
 * held back allocations.
 */
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
	u64 start = SZ_1M, len = 0, end = 0;
	int ret;

	*trimmed = 0;

	/* Discard not supported = nothing to do. */
	if (!blk_queue_discard(bdev_get_queue(device->bdev)))
		return 0;

	/* Not writable = nothing to do. */
	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return 0;

	/* No free space = nothing to do. */
	if (device->total_bytes <= device->bytes_used)
		return 0;

	ret = 0;

	while (1) {
		struct btrfs_fs_info *fs_info = device->fs_info;
		u64 bytes;

		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
		if (ret)
			break;

		find_first_clear_extent_bit(&device->alloc_state, start,
					    &start, &end,
					    CHUNK_TRIMMED | CHUNK_ALLOCATED);

		/* Check if there are any CHUNK_* bits left */
		if (start > device->total_bytes) {
			WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
			btrfs_warn_in_rcu(fs_info,
"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
					  start, end - start + 1,
					  rcu_str_deref(device->name),
					  device->total_bytes);
			mutex_unlock(&fs_info->chunk_mutex);
			ret = 0;
			break;
		}

		/* Ensure we skip the reserved area in the first 1M */
		start = max_t(u64, start, SZ_1M);

		/*
		 * If find_first_clear_extent_bit find a range that spans the
		 * end of the device it will set end to -1, in this case it's up
		 * to the caller to trim the value to the size of the device.
		 */
		end = min(end, device->total_bytes - 1);

		len = end - start + 1;

		/* We didn't find any extents */
		if (!len) {
			mutex_unlock(&fs_info->chunk_mutex);
			ret = 0;
			break;
		}

		ret = btrfs_issue_discard(device->bdev, start, len,
					  &bytes);
		if (!ret)
			set_extent_bits(&device->alloc_state, start,
					start + bytes - 1,
					CHUNK_TRIMMED);
		mutex_unlock(&fs_info->chunk_mutex);

		if (ret)
			break;

		start += len;
		*trimmed += bytes;

		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	return ret;
}

/*
 * Trim the whole filesystem by:
 * 1) trimming the free space in each block group
 * 2) trimming the unallocated space on each device
 *
 * This will also continue trimming even if a block group or device encounters
 * an error.  The return value will be the last error, or 0 if nothing bad
 * happens.
 */
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
	struct btrfs_block_group *cache = NULL;
	struct btrfs_device *device;
	struct list_head *devices;
	u64 group_trimmed;
	u64 range_end = U64_MAX;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 bg_failed = 0;
	u64 dev_failed = 0;
	int bg_ret = 0;
	int dev_ret = 0;
	int ret = 0;

	/*
	 * Check range overflow if range->len is set.
	 * The default range->len is U64_MAX.
	 */
	if (range->len != U64_MAX &&
	    check_add_overflow(range->start, range->len, &range_end))
		return -EINVAL;

	cache = btrfs_lookup_first_block_group(fs_info, range->start);
	for (; cache; cache = btrfs_next_block_group(cache)) {
		if (cache->start >= range_end) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->start);
		end = min(range_end, cache->start + cache->length);

		if (end - start >= range->minlen) {
			if (!btrfs_block_group_done(cache)) {
				ret = btrfs_cache_block_group(cache, 0);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
				ret = btrfs_wait_block_group_cache_done(cache);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				bg_failed++;
				bg_ret = ret;
				continue;
			}
		}
	}

	if (bg_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu block group(s), last error %d",
			bg_failed, bg_ret);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	devices = &fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		ret = btrfs_trim_free_extents(device, &group_trimmed);
		if (ret) {
			dev_failed++;
			dev_ret = ret;
			break;
		}

		trimmed += group_trimmed;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	if (dev_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu device(s), last error %d",
			dev_failed, dev_ret);
	range->len = trimmed;
	if (bg_ret)
		return bg_ret;
	return dev_ret;
}