// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "misc.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
#include "delalloc-space.h"
#include "block-group.h"
#include "discard.h"

#undef SCRAMBLE_DELAYED_REFS


static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);

static int block_group_bits(struct btrfs_block_group *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
			      u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE);
	set_extent_bits(&fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE);
	return 0;
}

void btrfs_free_excluded_extents(struct btrfs_block_group *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 start, end;

	start = cache->start;
	end = start + cache->length - 1;

	clear_extent_bits(&fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE);
	clear_extent_bits(&fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE);
}

static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
{
	if (ref->type == BTRFS_REF_METADATA) {
		if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
			return BTRFS_BLOCK_GROUP_SYSTEM;
		else
			return BTRFS_BLOCK_GROUP_METADATA;
	}
	return BTRFS_BLOCK_GROUP_DATA;
}

static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
			     struct btrfs_ref *ref)
{
	struct btrfs_space_info *space_info;
	u64 flags = generic_ref_to_space_flags(ref);

	space_info = btrfs_find_space_info(fs_info, flags);
	ASSERT(space_info);
	percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
		    BTRFS_TOTAL_BYTES_PINNED_BATCH);
}

static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
			     struct btrfs_ref *ref)
{
	struct btrfs_space_info *space_info;
	u64 flags = generic_ref_to_space_flags(ref);

	space_info = btrfs_find_space_info(fs_info, flags);
	ASSERT(space_info);
	percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
		    BTRFS_TOTAL_BYTES_PINNED_BATCH);
}

/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * the head node for a delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * what the reference count and extent flags will be once all of the
 * queued delayed refs have been processed, without actually running them.
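 *
 * For example (illustrative numbers, not taken from the code): if the
 * extent item on disk records 3 references and the delayed ref head has
 * a pending ref_mod of -1, the reported reference count is 2.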
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
		offset = fs_info->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
			ret = -EINVAL;
			btrfs_print_v0_err(fs_info);
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);

			goto out_free;
		}

		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			refcount_inc(&head->refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and try
			 * again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref_head(head);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. Implicit back refs are optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. Full back refs are for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back refs. Actually the full back refs are generic and
 * can be used in all cases where implicit back refs are used. The major
 * shortcoming of full back refs is their overhead: every time a tree
 * block gets COWed, we have to update the back ref entries for all
 * pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs are used for pointers
 * in the block. Remove these full back refs and add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs are used for
 * pointers in the block. Add full back refs for every pointer in the
 * block and increase the lower level extents' reference counts. The
 * original implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, and increase the lower level extents' reference counts.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs are used
 * and the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * consist only of a key. The key offset for the implicit back refs is the
 * objectid of the block's owner tree. The key offset for the full back
 * refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required. This information is stored in the
 * tree block info structure.
 */
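
/*
 * Illustrative example (an editorial sketch, not part of the original
 * comment): a data extent referenced by subvolume 257, inode 1234 at file
 * offset 0 carries an implicit back ref keyed as
 *
 *     (extent bytenr, BTRFS_EXTENT_DATA_REF_KEY,
 *      hash_extent_data_ref(257, 1234, 0))
 *
 * while the same extent, when referenced through a shared (e.g. relocated)
 * leaf at logical address P, carries a full back ref keyed as
 *
 *     (extent bytenr, BTRFS_SHARED_DATA_REF_KEY, P)
 */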

/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				     struct btrfs_extent_inline_ref *iref,
				     enum btrfs_inline_ref_type is_data)
{
	int type = btrfs_extent_inline_ref_type(eb, iref);
	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_DATA_REF_KEY ||
	    type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (is_data == BTRFS_REF_TYPE_BLOCK) {
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree
				 * block, which must be aligned to
				 * nodesize.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->nodesize))
					return type;
			}
		} else if (is_data == BTRFS_REF_TYPE_DATA) {
			if (type == BTRFS_EXTENT_DATA_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_DATA_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree
				 * block, which must be aligned to
				 * nodesize.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->nodesize))
					return type;
			}
		} else {
			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
			return type;
		}
	}

	btrfs_print_leaf((struct extent_buffer *)eb);
	btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
		  eb->start, type);
	WARN_ON(1);

	return BTRFS_REF_TYPE_INVALID;
}

u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   int refs_to_drop, int *last_ref)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
		btrfs_print_v0_err(trans->fs_info);
		btrfs_abort_transaction(trans, -EINVAL);
		return -EINVAL;
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
		*last_ref = 1;
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;
	int type;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
	if (iref) {
		/*
		 * If type is invalid, we should have bailed out earlier than
		 * this call.
		 */
		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
		ASSERT(type != BTRFS_REF_TYPE_INVALID);
		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
				      path, &key, 0);
	btrfs_release_path(path);
	return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)

{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
	int needed;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->keep_locks = 1;
	} else
		extra_size = -1;

	/*
	 * Owner is our level, so we can just add one to get the level for the
	 * block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		skinny_metadata = false;
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

	if (ret && !insert) {
		err = -ENOENT;
		goto out;
	} else if (WARN_ON(ret)) {
		err = -EIO;
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (unlikely(item_size < sizeof(*ei))) {
		err = -EINVAL;
		btrfs_print_v0_err(fs_info);
		btrfs_abort_transaction(trans, err);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
		needed = BTRFS_REF_TYPE_DATA;
	else
		needed = BTRFS_REF_TYPE_BLOCK;

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
		if (type == BTRFS_REF_TYPE_INVALID) {
			err = -EUCLEAN;
			goto out;
		}

		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

/*
 * helper to add new inline back ref
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	btrfs_extend_item(path, size);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset);
	}
	return ret;
}

/*
 * helper to update/remove inline back ref
 */
static noinline_for_stack
void update_inline_extent_backref(struct btrfs_path *path,
				  struct btrfs_extent_inline_ref *iref,
				  int refs_to_mod,
				  struct btrfs_delayed_extent_op *extent_op,
				  int *last_ref)
{
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	u64 refs;

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	/*
	 * If type is invalid, we should have bailed out after
	 * lookup_inline_extent_backref().
	 */
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
	ASSERT(type != BTRFS_REF_TYPE_INVALID);

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
	}

	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		*last_ref = 1;
		size =  btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		btrfs_truncate_item(path, item_size, 1);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 1);
	if (ret == 0) {
		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
		update_inline_extent_backref(path, iref, refs_to_add,
					     extent_op, NULL);
	} else if (ret == -ENOENT) {
		setup_inline_extent_backref(trans->fs_info, path, iref, parent,
					    root_objectid, owner, offset,
					    refs_to_add, extent_op);
		ret = 0;
	}
	return ret;
}

static int insert_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 u64 bytenr, u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add)
{
	int ret;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		BUG_ON(refs_to_add != 1);
		ret = insert_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = insert_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset,
					     refs_to_add);
	}
	return ret;
}

static int remove_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 int refs_to_drop, int is_data, int *last_ref)
{
	int ret = 0;

	BUG_ON(!is_data && refs_to_drop != 1);
	if (iref) {
		update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
					     last_ref);
	} else if (is_data) {
		ret = remove_extent_data_ref(trans, path, refs_to_drop,
					     last_ref);
	} else {
		*last_ref = 1;
		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
	}
	return ret;
}

static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
			       u64 *discarded_bytes)
{
	int j, ret = 0;
	u64 bytes_left, end;
	u64 aligned_start = ALIGN(start, 1 << 9);

	if (WARN_ON(start != aligned_start)) {
		len -= aligned_start - start;
		len = round_down(len, 1 << 9);
		start = aligned_start;
	}

	*discarded_bytes = 0;

	if (!len)
		return 0;

	end = start + len;
	bytes_left = len;

	/* Skip any superblocks on this device. */
	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
		u64 sb_start = btrfs_sb_offset(j);
		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
		u64 size = sb_start - start;

		if (!in_range(sb_start, start, bytes_left) &&
		    !in_range(sb_end, start, bytes_left) &&
		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
			continue;

		/*
		 * Superblock spans beginning of range.  Adjust start and
		 * try again.
		 */
		if (sb_start <= start) {
			start += sb_end - start;
			if (start > end) {
				bytes_left = 0;
				break;
			}
			bytes_left = end - start;
			continue;
		}

		if (size) {
			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
						   GFP_NOFS, 0);
			if (!ret)
				*discarded_bytes += size;
			else if (ret != -EOPNOTSUPP)
				return ret;
		}

		start = sb_end;
		if (start > end) {
			bytes_left = 0;
			break;
		}
		bytes_left = end - start;
	}

	if (bytes_left) {
		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
					   GFP_NOFS, 0);
		if (!ret)
			*discarded_bytes += bytes_left;
	}
	return ret;
}

int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
			 u64 num_bytes, u64 *actual_bytes)
{
	int ret = 0;
	u64 discarded_bytes = 0;
	u64 end = bytenr + num_bytes;
	u64 cur = bytenr;
	struct btrfs_bio *bbio = NULL;


	/*
	 * Avoid races with device replace and make sure our bbio has devices
	 * associated to its stripes that don't go away while we are discarding.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	while (cur < end) {
		struct btrfs_bio_stripe *stripe;
		int i;

		num_bytes = end - cur;
		/* Tell the block device(s) that the sectors can be discarded */
		ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
				      &num_bytes, &bbio, 0);
		/*
		 * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
		 * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
		 * thus we can't continue anyway.
		 */
		if (ret < 0)
			goto out;

		stripe = bbio->stripes;
		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
			u64 bytes;
			struct request_queue *req_q;

			if (!stripe->dev->bdev) {
				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
				continue;
			}
			req_q = bdev_get_queue(stripe->dev->bdev);
			if (!blk_queue_discard(req_q))
				continue;

			ret = btrfs_issue_discard(stripe->dev->bdev,
						  stripe->physical,
						  stripe->length,
						  &bytes);
			if (!ret) {
				discarded_bytes += bytes;
			} else if (ret != -EOPNOTSUPP) {
				/*
				 * Logic errors or -ENOMEM, or -EIO, but
				 * unlikely to happen.
				 *
				 * And since there are two loops, explicitly
				 * go to out to avoid confusion.
				 */
				btrfs_put_bbio(bbio);
				goto out;
			}

			/*
			 * Just in case we get back EOPNOTSUPP for some reason,
			 * just ignore the return value so we don't screw up
			 * people calling discard_extent.
			 */
			ret = 0;
		}
		btrfs_put_bbio(bbio);
		cur += num_bytes;
	}
out:
	btrfs_bio_counter_dec(fs_info);

	if (actual_bytes)
		*actual_bytes = discarded_bytes;


	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret;
}

/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_ref *generic_ref)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int old_ref_mod, new_ref_mod;
	int ret;

	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
	       generic_ref->action);
	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
	       generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);

	if (generic_ref->type == BTRFS_REF_METADATA)
		ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
				NULL, &old_ref_mod, &new_ref_mod);
	else
		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
						 &old_ref_mod, &new_ref_mod);

	btrfs_ref_tree_mod(fs_info, generic_ref);

	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
		sub_pinned_bytes(fs_info, generic_ref);

	return ret;
}

/*
 * __btrfs_inc_extent_ref - insert backreference for a given extent
 *
 * @trans:	    Handle of transaction
 *
 * @node:	    The delayed ref node used to get the bytenr/length for
 *		    extent whose references are incremented.
 *
 * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
 *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
 *		    bytenr of the parent block. Since new extents are always
 *		    created with indirect references, this will only be the case
 *		    when relocating a shared extent. In that case, root_objectid
 *		    will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
 *		    be 0
 *
 * @root_objectid:  The id of the root where this modification has originated,
 *		    this can be either one of the well-known metadata trees or
 *		    the subvolume id which references this extent.
 *
 * @owner:	    For data extents it is the inode number of the owning file.
 *		    For metadata extents this parameter holds the level in the
 *		    tree of the extent.
 *
 * @offset:	    For metadata extents the offset is ignored and is currently
 *		    always passed as 0. For data extents it is the file offset
 *		    this extent belongs to.
 *
 * @refs_to_add     Number of references to add
 *
 * @extent_op       Pointer to a structure, holding information necessary when
 *                  updating a tree block's flags
 *
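 * For example (illustrative, not an actual call site): adding one reference
 * for a file extent owned by inode 258 at file offset 1MiB in subvolume 257
 * would pass root_objectid=257, owner=258, offset=1048576, parent=0 and
 * refs_to_add=1.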
 */
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_delayed_ref_node *node,
				  u64 parent, u64 root_objectid,
				  u64 owner, u64 offset, int refs_to_add,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
	struct btrfs_key key;
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	u64 refs;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	path->leave_spinning = 1;
	/* this will setup the path even if it fails to insert the back ref */
	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
					   parent, root_objectid, owner,
					   offset, refs_to_add, extent_op);
	if ((ret < 0 && ret != -EAGAIN) || !ret)
		goto out;

	/*
	 * Ok we had -EAGAIN which means we didn't have space to insert an
	 * inline extent ref, so just update the reference count and add a
	 * normal backref.
	 */
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, item);
	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, item);

	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	path->reada = READA_FORWARD;
	path->leave_spinning = 1;
	/* now insert the actual backref */
	ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
				    owner, offset, refs_to_add);
	if (ret)
		btrfs_abort_transaction(trans, ret);
out:
	btrfs_free_path(path);
	return ret;
}

static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_key ins;
	u64 parent = 0;
	u64 ref_root = 0;
	u64 flags = 0;

	ins.objectid = node->bytenr;
	ins.offset = node->num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;

	ref = btrfs_delayed_node_to_data_ref(node);
	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);

	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
		parent = ref->parent;
	ref_root = ref->root;

	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		if (extent_op)
			flags |= extent_op->flags_to_set;
		ret = alloc_reserved_file_extent(trans, parent, ref_root,
						 flags, ref->objectid,
						 ref->offset, &ins,
						 node->ref_mod);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
					     ref->objectid, ref->offset,
					     node->ref_mod, extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, node, parent,
					  ref_root, ref->objectid,
					  ref->offset, node->ref_mod,
					  extent_op);
	} else {
		BUG();
	}
	return ret;
}

static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei)
{
	u64 flags = btrfs_extent_flags(leaf, ei);
	if (extent_op->update_flags) {
		flags |= extent_op->flags_to_set;
		btrfs_set_extent_flags(leaf, ei, flags);
	}

	if (extent_op->update_key) {
		struct btrfs_tree_block_info *bi;
		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
	}
}

static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
				 struct btrfs_delayed_ref_head *head,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	u32 item_size;
	int ret;
	int err = 0;
	int metadata = !extent_op->is_data;

	if (trans->aborted)
		return 0;

	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		metadata = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = head->bytenr;

	if (metadata) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = extent_op->level;
	} else {
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = head->num_bytes;
	}

again:
	path->reada = READA_FORWARD;
	path->leave_spinning = 1;
	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}
	if (ret > 0) {
		if (metadata) {
			if (path->slots[0] > 0) {
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == head->bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == head->num_bytes)
					ret = 0;
			}
			if (ret > 0) {
				btrfs_release_path(path);
				metadata = 0;

				key.objectid = head->bytenr;
				key.offset = head->num_bytes;
				key.type = BTRFS_EXTENT_ITEM_KEY;
				goto again;
			}
		} else {
			err = -EIO;
			goto out;
1637
		}
1638 1639 1640 1641
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1642

1643
	if (unlikely(item_size < sizeof(*ei))) {
1644 1645 1646 1647 1648 1649
		err = -EINVAL;
		btrfs_print_v0_err(fs_info);
		btrfs_abort_transaction(trans, err);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	__run_delayed_extent_op(extent_op, leaf, ei);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return err;
1657 1658
}

1659 1660 1661 1662
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
1663 1664
{
	int ret = 0;
1665 1666 1667
	struct btrfs_delayed_tree_ref *ref;
	u64 parent = 0;
	u64 ref_root = 0;
1668

1669
	ref = btrfs_delayed_node_to_tree_ref(node);
1670
	trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
1671

1672 1673
	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		parent = ref->parent;
J
	ref_root = ref->root;
1675

1676
	if (node->ref_mod != 1) {
1677
		btrfs_err(trans->fs_info,
1678 1679 1680 1681 1682
	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
			  node->bytenr, node->ref_mod, node->action, ref_root,
			  parent);
		return -EIO;
	}
1683
	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1684
		BUG_ON(!extent_op || !extent_op->update_flags);
1685
		ret = alloc_reserved_tree_block(trans, node, extent_op);
1686
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1687 1688
		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
					     ref->level, 0, 1, extent_op);
1689
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1690
		ret = __btrfs_free_extent(trans, node, parent, ref_root,
1691
					  ref->level, 0, 1, extent_op);
1692 1693 1694
	} else {
		BUG();
	}
1695 1696 1697 1698
	return ret;
}

/* helper function to actually process a single delayed ref entry */
1699 1700 1701 1702
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node,
			       struct btrfs_delayed_extent_op *extent_op,
			       int insert_reserved)
1703
{
1704 1705
	int ret = 0;

1706 1707
	if (trans->aborted) {
		if (insert_reserved)
1708
			btrfs_pin_extent(trans->fs_info, node->bytenr,
1709
					 node->num_bytes, 1);
1710
		return 0;
1711
	}
1712

1713 1714
	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1715
		ret = run_delayed_tree_ref(trans, node, extent_op,
1716 1717 1718
					   insert_reserved);
	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
		 node->type == BTRFS_SHARED_DATA_REF_KEY)
1719
		ret = run_delayed_data_ref(trans, node, extent_op,
1720 1721 1722
					   insert_reserved);
	else
		BUG();
1723 1724 1725
	if (ret && insert_reserved)
		btrfs_pin_extent(trans->fs_info, node->bytenr,
				 node->num_bytes, 1);
1726
	return ret;
1727 1728
}

1729
static inline struct btrfs_delayed_ref_node *
1730 1731
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
1732 1733
	struct btrfs_delayed_ref_node *ref;

1734
	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1735
		return NULL;
1736

1737 1738 1739 1740 1741 1742
	/*
	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
	 * This is to prevent a ref count from going down to zero, which deletes
	 * the extent item from the extent tree, when there still are references
	 * to add, which would fail because they would not find the extent item.
	 */
1743 1744 1745 1746
	if (!list_empty(&head->ref_add_list))
		return list_first_entry(&head->ref_add_list,
				struct btrfs_delayed_ref_node, add_list);

1747
	ref = rb_entry(rb_first_cached(&head->ref_tree),
1748
		       struct btrfs_delayed_ref_node, ref_node);
1749 1750
	ASSERT(list_empty(&ref->add_list));
	return ref;
1751 1752
}

static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
				      struct btrfs_delayed_ref_head *head)
{
	spin_lock(&delayed_refs->lock);
	head->processing = 0;
	delayed_refs->num_heads_ready++;
	spin_unlock(&delayed_refs->lock);
	btrfs_delayed_ref_unlock(head);
}

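/*
 * Grab the ref head's pending extent op, if any.  When must_insert_reserved
 * is set the op is not run on its own: it is freed here and NULL is
 * returned, and the caller falls back to the reserved extent handling.
 */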
static struct btrfs_delayed_extent_op *cleanup_extent_op(
				struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_extent_op *extent_op = head->extent_op;

	if (!extent_op)
		return NULL;

	if (head->must_insert_reserved) {
		head->extent_op = NULL;
		btrfs_free_delayed_extent_op(extent_op);
		return NULL;
	}
	return extent_op;
}

static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	extent_op = cleanup_extent_op(head);
	if (!extent_op)
		return 0;
	head->extent_op = NULL;
	spin_unlock(&head->lock);
	ret = run_delayed_extent_op(trans, head, extent_op);
	btrfs_free_delayed_extent_op(extent_op);
	return ret ? ret : 1;
}

void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
				  struct btrfs_delayed_ref_root *delayed_refs,
				  struct btrfs_delayed_ref_head *head)
{
	int nr_items = 1;	/* Dropping this ref head update. */

	if (head->total_ref_mod < 0) {
		struct btrfs_space_info *space_info;
		u64 flags;

		if (head->is_data)
			flags = BTRFS_BLOCK_GROUP_DATA;
		else if (head->is_system)
			flags = BTRFS_BLOCK_GROUP_SYSTEM;
		else
			flags = BTRFS_BLOCK_GROUP_METADATA;
		space_info = btrfs_find_space_info(fs_info, flags);
		ASSERT(space_info);
		percpu_counter_add_batch(&space_info->total_bytes_pinned,
				   -head->num_bytes,
				   BTRFS_TOTAL_BYTES_PINNED_BATCH);

		/*
		 * We had csum deletions accounted for in our delayed refs rsv,
		 * we need to drop the csum leaves for this update from our
		 * delayed_refs_rsv.
		 */
		if (head->is_data) {
			spin_lock(&delayed_refs->lock);
			delayed_refs->pending_csums -= head->num_bytes;
			spin_unlock(&delayed_refs->lock);
			nr_items += btrfs_csum_bytes_to_leaves(fs_info,
				head->num_bytes);
		}
	}

	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
}

static int cleanup_ref_head(struct btrfs_trans_handle *trans,
			    struct btrfs_delayed_ref_head *head)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	ret = run_and_cleanup_extent_op(trans, head);
	if (ret < 0) {
		unselect_delayed_ref_head(delayed_refs, head);
		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
		return ret;
	} else if (ret) {
		return ret;
	}

	/*
	 * Need to drop our head ref lock and re-acquire the delayed ref lock
	 * and then re-check to make sure nobody got added.
	 */
	spin_unlock(&head->lock);
	spin_lock(&delayed_refs->lock);
	spin_lock(&head->lock);
	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
		spin_unlock(&head->lock);
		spin_unlock(&delayed_refs->lock);
		return 1;
	}
	btrfs_delete_ref_head(delayed_refs, head);
	spin_unlock(&head->lock);
	spin_unlock(&delayed_refs->lock);

	if (head->must_insert_reserved) {
		btrfs_pin_extent(fs_info, head->bytenr,
				 head->num_bytes, 1);
		if (head->is_data) {
			ret = btrfs_del_csums(trans, fs_info->csum_root,
					      head->bytenr, head->num_bytes);
		}
	}

	btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);

	trace_run_delayed_ref_head(fs_info, head, 0);
	btrfs_delayed_ref_unlock(head);
	btrfs_put_delayed_ref_head(head);
	return 0;
}

static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
					struct btrfs_trans_handle *trans)
{
	struct btrfs_delayed_ref_root *delayed_refs =
		&trans->transaction->delayed_refs;
	struct btrfs_delayed_ref_head *head = NULL;
	int ret;

	spin_lock(&delayed_refs->lock);
	head = btrfs_select_ref_head(delayed_refs);
	if (!head) {
		spin_unlock(&delayed_refs->lock);
		return head;
	}

	/*
	 * Grab the lock that says we are going to process all the refs for
	 * this head
	 */
	ret = btrfs_delayed_ref_lock(delayed_refs, head);
	spin_unlock(&delayed_refs->lock);

	/*
	 * We may have dropped the spin lock to get the head mutex lock, and
	 * that might have given someone else time to free the head.  If that's
	 * true, it has been removed from our list and we can move on.
	 */
	if (ret == -EAGAIN)
		head = ERR_PTR(-EAGAIN);

	return head;
}

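/*
 * Run the delayed refs queued on one ref head, picking queued additions
 * first, until the head is empty or a ref has to wait for a tree mod log
 * sequence number (-EAGAIN).  Called with the head's mutex and spinlock
 * held; the spinlock is dropped around each ref that is run.
 */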
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
				    struct btrfs_delayed_ref_head *locked_ref,
				    unsigned long *run_refs)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_extent_op *extent_op;
	struct btrfs_delayed_ref_node *ref;
	int must_insert_reserved = 0;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	lockdep_assert_held(&locked_ref->mutex);
	lockdep_assert_held(&locked_ref->lock);

	while ((ref = select_delayed_ref(locked_ref))) {
		if (ref->seq &&
		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
			spin_unlock(&locked_ref->lock);
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			return -EAGAIN;
		}

		(*run_refs)++;
		ref->in_tree = 0;
		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
		RB_CLEAR_NODE(&ref->ref_node);
		if (!list_empty(&ref->add_list))
			list_del(&ref->add_list);
		/*
		 * When we play the delayed ref, also correct the ref_mod on
		 * head
		 */
		switch (ref->action) {
		case BTRFS_ADD_DELAYED_REF:
		case BTRFS_ADD_DELAYED_EXTENT:
			locked_ref->ref_mod -= ref->ref_mod;
			break;
		case BTRFS_DROP_DELAYED_REF:
			locked_ref->ref_mod += ref->ref_mod;
			break;
		default:
			WARN_ON(1);
		}
		atomic_dec(&delayed_refs->num_entries);

		/*
		 * Record the must_insert_reserved flag before we drop the
		 * spin lock.
		 */
		must_insert_reserved = locked_ref->must_insert_reserved;
		locked_ref->must_insert_reserved = 0;

		extent_op = locked_ref->extent_op;
		locked_ref->extent_op = NULL;
		spin_unlock(&locked_ref->lock);

		ret = run_one_delayed_ref(trans, ref, extent_op,
					  must_insert_reserved);

		btrfs_free_delayed_extent_op(extent_op);
		if (ret) {
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			btrfs_put_delayed_ref(ref);
			btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
				    ret);
			return ret;
		}

		btrfs_put_delayed_ref(ref);
		cond_resched();

		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
	}

	return 0;
}

/*
 * Returns 0 on success or if called with an already aborted transaction.
 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
 */
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
					     unsigned long nr)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *locked_ref = NULL;
	ktime_t start = ktime_get();
	int ret;
	unsigned long count = 0;
	unsigned long actual_count = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	do {
		if (!locked_ref) {
			locked_ref = btrfs_obtain_ref_head(trans);
			if (IS_ERR_OR_NULL(locked_ref)) {
				if (PTR_ERR(locked_ref) == -EAGAIN) {
					continue;
				} else {
					break;
				}
			}
			count++;
		}
		/*
		 * We need to try and merge add/drops of the same ref since we
		 * can run into issues with relocate dropping the implicit ref
		 * and then it being added back again before the drop can
		 * finish.  If we merged anything we need to re-loop so we can
		 * get a good ref.
		 * Or we can get node references of the same type that weren't
		 * merged when created due to bumps in the tree mod seq, and
		 * we need to merge them to prevent adding an inline extent
		 * backref before dropping it (triggering a BUG_ON at
		 * insert_inline_extent_backref()).
		 */
		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);

		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
						      &actual_count);
		if (ret < 0 && ret != -EAGAIN) {
			/*
			 * Error, btrfs_run_delayed_refs_for_head already
			 * unlocked everything so just bail out
			 */
			return ret;
		} else if (!ret) {
			/*
			 * Success, perform the usual cleanup of a processed
			 * head
			 */
			ret = cleanup_ref_head(trans, locked_ref);
			if (ret > 0) {
				/* We dropped our lock, we need to loop. */
				ret = 0;
				continue;
			} else if (ret) {
				return ret;
			}
		}

		/*
		 * Either success case or btrfs_run_delayed_refs_for_head
		 * returned -EAGAIN, meaning we need to select another head
		 */

		locked_ref = NULL;
		cond_resched();
	} while ((nr != -1 && count < nr) || locked_ref);

	/*
	 * We don't want to include ref heads since we can have empty ref heads
	 * and those will drastically skew our runtime down since we just do
	 * accounting, no actual extent tree updates.
	 */
	if (actual_count > 0) {
		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
		u64 avg;

		/*
		 * We weigh the current average higher than our current runtime
		 * to avoid large swings in the average.
		 */
		spin_lock(&delayed_refs->lock);
		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
		spin_unlock(&delayed_refs->lock);
	}
	return 0;
}

#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning
 */
static u64 find_middle(struct rb_root *root)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int alt = 1;
	u64 middle;
	u64 first = 0, last = 0;

	n = rb_first(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		first = entry->bytenr;
	}
	n = rb_last(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		last = entry->bytenr;
	}
	n = root->rb_node;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		middle = entry->bytenr;

		if (alt)
			n = n->rb_left;
		else
			n = n->rb_right;

		alt = 1 - alt;
	}
	return middle;
}
#endif

static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
{
	u64 num_bytes;

	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
			     sizeof(struct btrfs_extent_inline_ref));
	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		num_bytes += heads * sizeof(struct btrfs_tree_block_info);

	/*
	 * We don't ever fill up leaves all the way so multiply by 2 just to be
	 * closer to what we're really going to want to use.
	 */
	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
}

/*
 * Takes the number of bytes to be csumm'ed and figures out how many leaves it
 * would require to store the csums for that many bytes.
 */
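/*
 * Illustrative numbers: with 4KiB sectors, 4-byte crc32c csums and roughly
 * 16KiB of usable item space per leaf, one leaf holds on the order of 4000
 * csums, i.e. covers about 16MiB of data.  csum_bytes is converted to a
 * csum count and then divided (rounding up) by the per-leaf capacity.
 */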
u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
{
	u64 csum_size;
	u64 num_csums_per_leaf;
	u64 num_csums;

	csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
	num_csums_per_leaf = div64_u64(csum_size,
			(u64)btrfs_super_csum_size(fs_info->super_copy));
	num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
	num_csums += num_csums_per_leaf - 1;
	num_csums = div64_u64(num_csums, num_csums_per_leaf);
	return num_csums;
}

/*
 * this starts processing the delayed reference count updates and
 * extent insertions we have queued up so far.  count can be
 * 0, which means to process everything in the tree at the start
 * of the run (but not newly added entries), or it can be some target
 * number you'd like to process.
 *
 * Returns 0 on success or if called with an aborted transaction
 * Returns <0 on error and aborts the transaction
 */
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
			   unsigned long count)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct rb_node *node;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *head;
	int ret;
	int run_all = count == (unsigned long)-1;

	/* We'll clean this up in btrfs_cleanup_transaction */
	if (trans->aborted)
		return 0;

	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
		return 0;

	delayed_refs = &trans->transaction->delayed_refs;
	if (count == 0)
		count = atomic_read(&delayed_refs->num_entries) * 2;

again:
#ifdef SCRAMBLE_DELAYED_REFS
	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
	ret = __btrfs_run_delayed_refs(trans, count);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (run_all) {
		btrfs_create_pending_block_groups(trans);

		spin_lock(&delayed_refs->lock);
		node = rb_first_cached(&delayed_refs->href_root);
		if (!node) {
			spin_unlock(&delayed_refs->lock);
			goto out;
		}
		head = rb_entry(node, struct btrfs_delayed_ref_head,
				href_node);
		refcount_inc(&head->refs);
		spin_unlock(&delayed_refs->lock);

		/* Mutex was contended, block until it's released and retry. */
		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);

		btrfs_put_delayed_ref_head(head);
		cond_resched();
		goto again;
	}
out:
	return 0;
}

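/*
 * Queue a delayed extent op that updates the flags of an existing extent
 * item; the extent tree itself is only touched when delayed refs are run.
 */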
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
				u64 bytenr, u64 num_bytes, u64 flags,
				int level, int is_data)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	extent_op = btrfs_alloc_delayed_extent_op();
	if (!extent_op)
		return -ENOMEM;

	extent_op->flags_to_set = flags;
	extent_op->update_flags = true;
	extent_op->update_key = false;
	extent_op->is_data = is_data ? true : false;
	extent_op->level = level;

	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
	if (ret)
		btrfs_free_delayed_extent_op(extent_op);
	return ret;
}

static noinline int check_delayed_ref(struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid, u64 offset, u64 bytenr)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_data_ref *data_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_transaction *cur_trans;
	struct rb_node *node;
	int ret = 0;

	spin_lock(&root->fs_info->trans_lock);
	cur_trans = root->fs_info->running_transaction;
	if (cur_trans)
		refcount_inc(&cur_trans->use_count);
	spin_unlock(&root->fs_info->trans_lock);
	if (!cur_trans)
		return 0;

	delayed_refs = &cur_trans->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (!head) {
		spin_unlock(&delayed_refs->lock);
		btrfs_put_transaction(cur_trans);
		return 0;
	}

	if (!mutex_trylock(&head->mutex)) {
		refcount_inc(&head->refs);
		spin_unlock(&delayed_refs->lock);

		btrfs_release_path(path);

		/*
		 * Mutex was contended, block until it's released and let
		 * caller try again
		 */
		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref_head(head);
		btrfs_put_transaction(cur_trans);
		return -EAGAIN;
	}
	spin_unlock(&delayed_refs->lock);

	spin_lock(&head->lock);
	/*
	 * XXX: We should replace this with a proper search function in the
	 * future.
	 */
	for (node = rb_first_cached(&head->ref_tree); node;
	     node = rb_next(node)) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		/* If it's a shared ref we know a cross reference exists */
		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
			ret = 1;
			break;
		}

		data_ref = btrfs_delayed_node_to_data_ref(ref);

		/*
		 * If our ref doesn't match the one we're currently looking at
		 * then we have a cross reference.
		 */
		if (data_ref->root != root->root_key.objectid ||
		    data_ref->objectid != objectid ||
		    data_ref->offset != offset) {
			ret = 1;
			break;
		}
	}
	spin_unlock(&head->lock);
	mutex_unlock(&head->mutex);
	btrfs_put_transaction(cur_trans);
	return ret;
}

static noinline int check_committed_ref(struct btrfs_root *root,
					struct btrfs_path *path,
					u64 objectid, u64 offset, u64 bytenr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_extent_item *ei;
	struct btrfs_key key;
	u32 item_size;
	int type;
	int ret;

	key.objectid = bytenr;
	key.offset = (u64)-1;
	key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	BUG_ON(ret == 0); /* Corruption */

	ret = -ENOENT;
	if (path->slots[0] == 0)
		goto out;

	path->slots[0]--;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
		goto out;

	ret = 1;
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);

	/* If extent item has more than 1 inline ref then it's shared */
	if (item_size != sizeof(*ei) +
	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
		goto out;

	/* If extent created before last snapshot => it's definitely shared */
	if (btrfs_extent_generation(leaf, ei) <=
	    btrfs_root_last_snapshot(&root->root_item))
		goto out;

	iref = (struct btrfs_extent_inline_ref *)(ei + 1);

	/* If this extent has SHARED_DATA_REF then it's shared */
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
	if (type != BTRFS_EXTENT_DATA_REF_KEY)
		goto out;

	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
	if (btrfs_extent_refs(leaf, ei) !=
	    btrfs_extent_data_ref_count(leaf, ref) ||
	    btrfs_extent_data_ref_root(leaf, ref) !=
	    root->root_key.objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		goto out;

	ret = 0;
out:
	return ret;
}

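/*
 * Check whether any root other than 'root' references the given data
 * extent: first look at the committed extent tree, then at delayed refs
 * still queued in the running transaction, retrying if the ref head mutex
 * was contended (-EAGAIN).
 */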
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
			  u64 bytenr)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	do {
		ret = check_committed_ref(root, path, objectid,
					  offset, bytenr);
		if (ret && ret != -ENOENT)
			goto out;

		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
	} while (ret == -EAGAIN);

out:
	btrfs_free_path(path);
	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
		WARN_ON(ret > 0);
	return ret;
}

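/*
 * Walk one tree block and add or drop a reference for every extent it
 * points to.  With full_backref the references are recorded against the
 * physical block (parent), otherwise against the owning root.
 */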
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct extent_buffer *buf,
			   int full_backref, int inc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
2446 2447
	u64 num_bytes;
	u64 parent;
	u64 ref_root;
	u32 nritems;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
2452 2453
	struct btrfs_ref generic_ref = { 0 };
	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
	int i;
2455
	int action;
	int level;
	int ret = 0;
2458

2459
	if (btrfs_is_testing(fs_info))
2460
		return 0;
2461

	ref_root = btrfs_header_owner(buf);
	nritems = btrfs_header_nritems(buf);
	level = btrfs_header_level(buf);

2466
	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
2467
		return 0;

2469 2470 2471 2472
	if (full_backref)
		parent = buf->start;
	else
		parent = 0;
2473 2474 2475 2476
	if (inc)
		action = BTRFS_ADD_DELAYED_REF;
	else
		action = BTRFS_DROP_DELAYED_REF;
2477 2478

	for (i = 0; i < nritems; i++) {
		if (level == 0) {
2480
			btrfs_item_key_to_cpu(buf, &key, i);
2481
			if (key.type != BTRFS_EXTENT_DATA_KEY)
				continue;
2483
			fi = btrfs_item_ptr(buf, i,
					    struct btrfs_file_extent_item);
			if (btrfs_file_extent_type(buf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE)
				continue;
			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
			if (bytenr == 0)
				continue;
2491 2492 2493

			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
			key.offset -= btrfs_file_extent_offset(buf, fi);
2494 2495 2496 2497 2498 2499
			btrfs_init_generic_ref(&generic_ref, action, bytenr,
					       num_bytes, parent);
			generic_ref.real_root = root->root_key.objectid;
			btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
					    key.offset);
			generic_ref.skip_qgroup = for_reloc;
2500
			if (inc)
2501
				ret = btrfs_inc_extent_ref(trans, &generic_ref);
2502
			else
2503
				ret = btrfs_free_extent(trans, &generic_ref);
			if (ret)
				goto fail;
		} else {
2507
			bytenr = btrfs_node_blockptr(buf, i);
2508
			num_bytes = fs_info->nodesize;
2509 2510 2511 2512 2513
			btrfs_init_generic_ref(&generic_ref, action, bytenr,
					       num_bytes, parent);
			generic_ref.real_root = root->root_key.objectid;
			btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
			generic_ref.skip_qgroup = for_reloc;
2514
			if (inc)
2515
				ret = btrfs_inc_extent_ref(trans, &generic_ref);
2516
			else
2517
				ret = btrfs_free_extent(trans, &generic_ref);
			if (ret)
				goto fail;
		}
	}
	return 0;
fail:
2524 2525 2526 2527
	return ret;
}

int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2528
		  struct extent_buffer *buf, int full_backref)
2529
{
2530
	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2531 2532 2533
}

int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2534
		  struct extent_buffer *buf, int full_backref)
2535
{
2536
	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}

2539
int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
2540
{
2541
	struct btrfs_block_group *block_group;
2542 2543
	int readonly = 0;

2544
	block_group = btrfs_lookup_block_group(fs_info, bytenr);
2545 2546 2547
	if (!block_group || block_group->ro)
		readonly = 1;
	if (block_group)
2548
		btrfs_put_block_group(block_group);
2549 2550 2551
	return readonly;
}

2552
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
{
2554
	struct btrfs_fs_info *fs_info = root->fs_info;
2555
	u64 flags;
	u64 ret;

2558 2559
	if (data)
		flags = BTRFS_BLOCK_GROUP_DATA;
2560
	else if (root == fs_info->chunk_root)
2561
		flags = BTRFS_BLOCK_GROUP_SYSTEM;
	else
2563
		flags = BTRFS_BLOCK_GROUP_METADATA;

2565
	ret = btrfs_get_alloc_profile(fs_info, flags);
	return ret;
}

2569
static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
2570
{
2571
	struct btrfs_block_group *cache;
2572
	u64 bytenr;

2574 2575 2576
	spin_lock(&fs_info->block_group_cache_lock);
	bytenr = fs_info->first_logical_byte;
	spin_unlock(&fs_info->block_group_cache_lock);
2577 2578 2579 2580

	if (bytenr < (u64)-1)
		return bytenr;

2581
	cache = btrfs_lookup_first_block_group(fs_info, search_start);
	if (!cache)
2583
		return 0;

2585
	bytenr = cache->start;
2586
	btrfs_put_block_group(cache);
2587 2588

	return bytenr;
}

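/*
 * Account an extent as pinned in its block group and space_info; if it was
 * still reserved, move the bytes from reserved to pinned.  Pinned space is
 * only returned to the allocator when the transaction commits.
 */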
static int pin_down_extent(struct btrfs_block_group *cache,
			   u64 bytenr, u64 num_bytes, int reserved)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;

2596 2597 2598
	spin_lock(&cache->space_info->lock);
	spin_lock(&cache->lock);
	cache->pinned += num_bytes;
2599 2600
	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
					     num_bytes);
2601 2602 2603 2604 2605 2606
	if (reserved) {
		cache->reserved -= num_bytes;
		cache->space_info->bytes_reserved -= num_bytes;
	}
	spin_unlock(&cache->lock);
	spin_unlock(&cache->space_info->lock);

2608 2609
	percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
		    num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
2610
	set_extent_dirty(fs_info->pinned_extents, bytenr,
2611 2612 2613
			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
	return 0;
}

2615
int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
2616 2617
		     u64 bytenr, u64 num_bytes, int reserved)
{
2618
	struct btrfs_block_group *cache;

2620 2621
	ASSERT(fs_info->running_transaction);

2622
	cache = btrfs_lookup_block_group(fs_info, bytenr);
2623
	BUG_ON(!cache); /* Logic error */
2624

2625
	pin_down_extent(cache, bytenr, num_bytes, reserved);
2626 2627

	btrfs_put_block_group(cache);
2628 2629 2630
	return 0;
}

2631
/*
2632 2633
 * this function must be called within transaction
 */
2634
int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
2635 2636
				    u64 bytenr, u64 num_bytes)
{
2637
	struct btrfs_block_group *cache;
2638
	int ret;
2639

2640
	cache = btrfs_lookup_block_group(fs_info, bytenr);
2641 2642
	if (!cache)
		return -EINVAL;
2643 2644 2645 2646 2647 2648 2649

	/*
	 * pull in the free space cache (if any) so that our pin
	 * removes the free space from the cache.  We have load_only set
	 * to one because the slow code to read in the free extents does check
	 * the pinned extents.
	 */
2650
	btrfs_cache_block_group(cache, 1);
2651

2652
	pin_down_extent(cache, bytenr, num_bytes, 0);
2653 2654

	/* remove us from the free space cache (if we're there at all) */
2655
	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
2656
	btrfs_put_block_group(cache);
2657
	return ret;
2658 2659
}

2660 2661
static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
				   u64 start, u64 num_bytes)
2662 2663
{
	int ret;
2664
	struct btrfs_block_group *block_group;
2665 2666
	struct btrfs_caching_control *caching_ctl;

2667
	block_group = btrfs_lookup_block_group(fs_info, start);
2668 2669 2670
	if (!block_group)
		return -EINVAL;

2671
	btrfs_cache_block_group(block_group, 0);
2672
	caching_ctl = btrfs_get_caching_control(block_group);
2673 2674 2675

	if (!caching_ctl) {
		/* Logic error */
2676
		BUG_ON(!btrfs_block_group_done(block_group));
2677 2678 2679 2680 2681
		ret = btrfs_remove_free_space(block_group, start, num_bytes);
	} else {
		mutex_lock(&caching_ctl->mutex);

		if (start >= caching_ctl->progress) {
2682 2683
			ret = btrfs_add_excluded_extent(fs_info, start,
							num_bytes);
2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696
		} else if (start + num_bytes <= caching_ctl->progress) {
			ret = btrfs_remove_free_space(block_group,
						      start, num_bytes);
		} else {
			num_bytes = caching_ctl->progress - start;
			ret = btrfs_remove_free_space(block_group,
						      start, num_bytes);
			if (ret)
				goto out_lock;

			num_bytes = (start + num_bytes) -
				caching_ctl->progress;
			start = caching_ctl->progress;
2697 2698
			ret = btrfs_add_excluded_extent(fs_info, start,
							num_bytes);
2699 2700 2701
		}
out_lock:
		mutex_unlock(&caching_ctl->mutex);
2702
		btrfs_put_caching_control(caching_ctl);
2703 2704 2705 2706 2707
	}
	btrfs_put_block_group(block_group);
	return ret;
}

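/*
 * During log replay, walk a replayed leaf and exclude every data extent it
 * references from free space accounting so the allocator does not hand
 * those ranges out again.  Only needed when mixed block groups are in use.
 */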
int btrfs_exclude_logged_extents(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct btrfs_file_extent_item *item;
	struct btrfs_key key;
	int found_type;
	int i;
2715
	int ret = 0;
2716

2717
	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731
		return 0;

	for (i = 0; i < btrfs_header_nritems(eb); i++) {
		btrfs_item_key_to_cpu(eb, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
		found_type = btrfs_file_extent_type(eb, item);
		if (found_type == BTRFS_FILE_EXTENT_INLINE)
			continue;
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			continue;
		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2732 2733 2734
		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
		if (ret)
			break;
2735 2736
	}

2737
	return ret;
2738 2739
}

2740
static void
2741
btrfs_inc_block_group_reservations(struct btrfs_block_group *bg)
2742 2743 2744 2745
{
	atomic_inc(&bg->reservations);
}

2746
void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
2747
{
2748 2749
	struct btrfs_caching_control *next;
	struct btrfs_caching_control *caching_ctl;
2750
	struct btrfs_block_group *cache;
2751

2752
	down_write(&fs_info->commit_root_sem);
2753

2754 2755 2756
	list_for_each_entry_safe(caching_ctl, next,
				 &fs_info->caching_block_groups, list) {
		cache = caching_ctl->block_group;
2757
		if (btrfs_block_group_done(cache)) {
2758 2759
			cache->last_byte_to_unpin = (u64)-1;
			list_del_init(&caching_ctl->list);
2760
			btrfs_put_caching_control(caching_ctl);
2761
		} else {
2762
			cache->last_byte_to_unpin = caching_ctl->progress;
2763 2764
		}
	}
2765 2766 2767 2768 2769 2770

	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
		fs_info->pinned_extents = &fs_info->freed_extents[1];
	else
		fs_info->pinned_extents = &fs_info->freed_extents[0];

2771
	up_write(&fs_info->commit_root_sem);
2772

2773
	btrfs_update_global_block_rsv(fs_info);
2774 2775
}

2776 2777 2778 2779 2780
/*
 * Returns the free cluster for the given space info and sets empty_cluster to
 * what it should be based on the mount options.
 */
static struct btrfs_free_cluster *
2781 2782
fetch_cluster_info(struct btrfs_fs_info *fs_info,
		   struct btrfs_space_info *space_info, u64 *empty_cluster)
2783 2784 2785 2786 2787 2788 2789 2790
{
	struct btrfs_free_cluster *ret = NULL;

	*empty_cluster = 0;
	if (btrfs_mixed_space_info(space_info))
		return ret;

	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2791
		ret = &fs_info->meta_alloc_cluster;
2792 2793 2794
		if (btrfs_test_opt(fs_info, SSD))
			*empty_cluster = SZ_2M;
		else
2795
			*empty_cluster = SZ_64K;
2796 2797 2798
	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
		*empty_cluster = SZ_2M;
2799
		ret = &fs_info->data_alloc_cluster;
2800 2801 2802 2803 2804
	}

	return ret;
}

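/*
 * Unpin a byte range at transaction commit: return the space to the block
 * group's free space cache (when requested), update the pinned counters
 * and, if possible, refill the global block reserve before waking any
 * waiting space tickets.
 */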
static int unpin_extent_range(struct btrfs_fs_info *fs_info,
			      u64 start, u64 end,
			      const bool return_free_space)
{
	struct btrfs_block_group *cache = NULL;
2810 2811
	struct btrfs_space_info *space_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2812
	struct btrfs_free_cluster *cluster = NULL;
2813
	u64 len;
2814 2815
	u64 total_unpinned = 0;
	u64 empty_cluster = 0;
2816
	bool readonly;
C
Chris Mason 已提交
2817

2818
	while (start <= end) {
2819
		readonly = false;
2820
		if (!cache ||
2821
		    start >= cache->start + cache->length) {
2822 2823
			if (cache)
				btrfs_put_block_group(cache);
2824
			total_unpinned = 0;
2825
			cache = btrfs_lookup_block_group(fs_info, start);
2826
			BUG_ON(!cache); /* Logic error */
2827

2828
			cluster = fetch_cluster_info(fs_info,
2829 2830 2831
						     cache->space_info,
						     &empty_cluster);
			empty_cluster <<= 1;
2832 2833
		}

2834
		len = cache->start + cache->length - start;
2835 2836 2837 2838
		len = min(len, end + 1 - start);

		if (start < cache->last_byte_to_unpin) {
			len = min(len, cache->last_byte_to_unpin - start);
2839 2840
			if (return_free_space)
				btrfs_add_free_space(cache, start, len);
2841 2842
		}

2843
		start += len;
2844
		total_unpinned += len;
2845
		space_info = cache->space_info;
2846

2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859
		/*
		 * If this space cluster has been marked as fragmented and we've
		 * unpinned enough in this block group to potentially allow a
		 * cluster to be created inside of it go ahead and clear the
		 * fragmented check.
		 */
		if (cluster && cluster->fragmented &&
		    total_unpinned > empty_cluster) {
			spin_lock(&cluster->lock);
			cluster->fragmented = 0;
			spin_unlock(&cluster->lock);
		}

2860
		spin_lock(&space_info->lock);
2861 2862
		spin_lock(&cache->lock);
		cache->pinned -= len;
2863
		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2864
		space_info->max_extent_size = 0;
2865 2866
		percpu_counter_add_batch(&space_info->total_bytes_pinned,
			    -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
2867 2868 2869 2870
		if (cache->ro) {
			space_info->bytes_readonly += len;
			readonly = true;
		}
2871
		spin_unlock(&cache->lock);
2872 2873 2874
		if (!readonly && return_free_space &&
		    global_rsv->space_info == space_info) {
			u64 to_add = len;
2875

2876 2877
			spin_lock(&global_rsv->lock);
			if (!global_rsv->full) {
2878 2879 2880
				to_add = min(len, global_rsv->size -
					     global_rsv->reserved);
				global_rsv->reserved += to_add;
2881 2882
				btrfs_space_info_update_bytes_may_use(fs_info,
						space_info, to_add);
2883 2884
				if (global_rsv->reserved >= global_rsv->size)
					global_rsv->full = 1;
2885
				len -= to_add;
2886 2887
			}
			spin_unlock(&global_rsv->lock);
2888 2889
			/* Add to any tickets we may have */
			if (len)
2890 2891
				btrfs_try_granting_tickets(fs_info,
							   space_info);
2892 2893
		}
		spin_unlock(&space_info->lock);
	}
2895 2896 2897

	if (cache)
		btrfs_put_block_group(cache);
	return 0;
}

2901
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2902
{
2903
	struct btrfs_fs_info *fs_info = trans->fs_info;
2904
	struct btrfs_block_group *block_group, *tmp;
2905
	struct list_head *deleted_bgs;
2906
	struct extent_io_tree *unpin;
2907 2908
	u64 start;
	u64 end;
2909 2910
	int ret;

2911 2912 2913 2914 2915
	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
		unpin = &fs_info->freed_extents[1];
	else
		unpin = &fs_info->freed_extents[0];

2916
	while (!trans->aborted) {
2917 2918
		struct extent_state *cached_state = NULL;

2919
		mutex_lock(&fs_info->unused_bg_unpin_mutex);
2920
		ret = find_first_extent_bit(unpin, 0, &start, &end,
2921
					    EXTENT_DIRTY, &cached_state);
2922 2923
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2924
			break;
2925
		}
2926

2927
		if (btrfs_test_opt(fs_info, DISCARD_SYNC))
2928
			ret = btrfs_discard_extent(fs_info, start,
2929
						   end + 1 - start, NULL);
2930

2931
		clear_extent_dirty(unpin, start, end, &cached_state);
2932
		unpin_extent_range(fs_info, start, end, true);
2933
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2934
		free_extent_state(cached_state);
2935
		cond_resched();
2936
	}

2938 2939
	if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
		btrfs_discard_calc_delay(&fs_info->discard_ctl);
2940
		btrfs_discard_schedule_work(&fs_info->discard_ctl, true);
2941
	}
2942

2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953
	/*
	 * Transaction is finished.  We don't need the lock anymore.  We
	 * do need to clean up the block groups in case of a transaction
	 * abort.
	 */
	deleted_bgs = &trans->transaction->deleted_bgs;
	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
		u64 trimmed = 0;

		ret = -EROFS;
		if (!trans->aborted)
2954
			ret = btrfs_discard_extent(fs_info,
2955 2956
						   block_group->start,
						   block_group->length,
2957 2958 2959 2960 2961 2962 2963 2964 2965
						   &trimmed);

		list_del_init(&block_group->bg_list);
		btrfs_put_block_group_trimming(block_group);
		btrfs_put_block_group(block_group);

		if (ret) {
			const char *errstr = btrfs_decode_error(ret);
			btrfs_warn(fs_info,
2966
			   "discard failed while removing blockgroup: errno=%d %s",
2967 2968 2969 2970
				   ret, errstr);
		}
	}

	return 0;
}

2974
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2975 2976 2977 2978
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extent_op)
2979
{
2980
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_key key;
2982
	struct btrfs_path *path;
2983
	struct btrfs_root *extent_root = info->extent_root;
2984
	struct extent_buffer *leaf;
2985 2986
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
2987
	int ret;
2988
	int is_data;
2989 2990 2991
	int extent_slot = 0;
	int found_extent = 0;
	int num_to_del = 1;
2992 2993
	u32 item_size;
	u64 refs;
2994 2995
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	int last_ref = 0;
2997
	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);

2999
	path = btrfs_alloc_path();
3000 3001
	if (!path)
		return -ENOMEM;
3002

3003
	path->reada = READA_FORWARD;
3004
	path->leave_spinning = 1;
3005 3006 3007 3008

	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
	BUG_ON(!is_data && refs_to_drop != 1);

3009
	if (is_data)
3010
		skinny_metadata = false;
3011

3012 3013
	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
				    parent, root_objectid, owner_objectid,
3014
				    owner_offset);
3015
	if (ret == 0) {
3016
		extent_slot = path->slots[0];
3017 3018
		while (extent_slot >= 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
3019
					      extent_slot);
3020
			if (key.objectid != bytenr)
3021
				break;
3022 3023
			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes) {
3024 3025 3026
				found_extent = 1;
				break;
			}
3027 3028 3029 3030 3031
			if (key.type == BTRFS_METADATA_ITEM_KEY &&
			    key.offset == owner_objectid) {
				found_extent = 1;
				break;
			}
3032 3033
			if (path->slots[0] - extent_slot > 5)
				break;
3034
			extent_slot--;
3035
		}
3036

		if (!found_extent) {
3038
			BUG_ON(iref);
3039
			ret = remove_extent_backref(trans, path, NULL,
3040
						    refs_to_drop,
						    is_data, &last_ref);
3042
			if (ret) {
3043
				btrfs_abort_transaction(trans, ret);
3044 3045
				goto out;
			}
3046
			btrfs_release_path(path);
3047
			path->leave_spinning = 1;
3048 3049 3050 3051 3052

			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;

3053 3054 3055 3056 3057
			if (!is_data && skinny_metadata) {
				key.type = BTRFS_METADATA_ITEM_KEY;
				key.offset = owner_objectid;
			}

			ret = btrfs_search_slot(trans, extent_root,
						&key, path, -1, 1);
3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075
			if (ret > 0 && skinny_metadata && path->slots[0]) {
				/*
				 * Couldn't find our skinny metadata item,
				 * see if we have ye olde extent item.
				 */
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == num_bytes)
					ret = 0;
			}

			if (ret > 0 && skinny_metadata) {
				skinny_metadata = false;
3076
				key.objectid = bytenr;
3077 3078 3079 3080 3081 3082 3083
				key.type = BTRFS_EXTENT_ITEM_KEY;
				key.offset = num_bytes;
				btrfs_release_path(path);
				ret = btrfs_search_slot(trans, extent_root,
							&key, path, -1, 1);
			}

3084
			if (ret) {
				btrfs_err(info,
					  "umm, got %d back from search, was looking for %llu",
					  ret, bytenr);
3088
				if (ret > 0)
3089
					btrfs_print_leaf(path->nodes[0]);
3090
			}
3091
			if (ret < 0) {
3092
				btrfs_abort_transaction(trans, ret);
3093 3094
				goto out;
			}
			extent_slot = path->slots[0];
		}
3097
	} else if (WARN_ON(ret == -ENOENT)) {
3098
		btrfs_print_leaf(path->nodes[0]);
3099 3100
		btrfs_err(info,
			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
3101 3102
			bytenr, parent, root_objectid, owner_objectid,
			owner_offset);
3103
		btrfs_abort_transaction(trans, ret);
3104
		goto out;
3105
	} else {
3106
		btrfs_abort_transaction(trans, ret);
3107
		goto out;
3108
	}
3109 3110

	leaf = path->nodes[0];
3111
	item_size = btrfs_item_size_nr(leaf, extent_slot);
3112
	if (unlikely(item_size < sizeof(*ei))) {
3113 3114 3115 3116 3117
		ret = -EINVAL;
		btrfs_print_v0_err(info);
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
3118
	ei = btrfs_item_ptr(leaf, extent_slot,
			    struct btrfs_extent_item);
3120 3121
	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
	    key.type == BTRFS_EXTENT_ITEM_KEY) {
3122 3123 3124 3125 3126
		struct btrfs_tree_block_info *bi;
		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
	}
3127

3128
	refs = btrfs_extent_refs(leaf, ei);
3129
	if (refs < refs_to_drop) {
		btrfs_err(info,
			  "trying to drop %d refs but we only have %Lu for bytenr %Lu",
			  refs_to_drop, refs, bytenr);
3133
		ret = -EINVAL;
3134
		btrfs_abort_transaction(trans, ret);
3135 3136
		goto out;
	}
3137
	refs -= refs_to_drop;
3138

3139 3140 3141 3142 3143 3144
	if (refs > 0) {
		if (extent_op)
			__run_delayed_extent_op(extent_op, leaf, ei);
		/*
		 * In the case of inline back ref, reference count will
		 * be updated by remove_extent_backref
3145
		 */
3146 3147 3148 3149 3150 3151 3152
		if (iref) {
			BUG_ON(!found_extent);
		} else {
			btrfs_set_extent_refs(leaf, ei, refs);
			btrfs_mark_buffer_dirty(leaf);
		}
		if (found_extent) {
3153 3154 3155
			ret = remove_extent_backref(trans, path, iref,
						    refs_to_drop, is_data,
						    &last_ref);
3156
			if (ret) {
3157
				btrfs_abort_transaction(trans, ret);
3158 3159
				goto out;
			}
3160
		}
3161 3162 3163
	} else {
		if (found_extent) {
			BUG_ON(is_data && refs_to_drop !=
3164
			       extent_data_ref_count(path, iref));
3165 3166 3167 3168 3169 3170 3171
			if (iref) {
				BUG_ON(path->slots[0] != extent_slot);
			} else {
				BUG_ON(path->slots[0] != extent_slot + 1);
				path->slots[0] = extent_slot;
				num_to_del = 2;
			}
		}
3173

		last_ref = 1;
3175 3176
		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
				      num_to_del);
3177
		if (ret) {
3178
			btrfs_abort_transaction(trans, ret);
3179 3180
			goto out;
		}
3181
		btrfs_release_path(path);
3182

3183
		if (is_data) {
3184 3185
			ret = btrfs_del_csums(trans, info->csum_root, bytenr,
					      num_bytes);
3186
			if (ret) {
3187
				btrfs_abort_transaction(trans, ret);
3188 3189
				goto out;
			}
3190 3191
		}

3192
		ret = add_to_free_space_tree(trans, bytenr, num_bytes);
3193
		if (ret) {
3194
			btrfs_abort_transaction(trans, ret);
3195 3196 3197
			goto out;
		}

3198
		ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
3199
		if (ret) {
3200
			btrfs_abort_transaction(trans, ret);
3201 3202
			goto out;
		}
3203
	}
	btrfs_release_path(path);

3206
out:
3207
	btrfs_free_path(path);
3208 3209 3210
	return ret;
}

/*
 * when we free a block, it is possible (and likely) that we free the last
 * delayed ref for that extent as well.  This searches the delayed ref tree for
 * a given extent, and if there are no other delayed refs to be processed, it
 * removes it from the tree.
 */
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3218
				      u64 bytenr)
3219 3220 3221
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
3222
	int ret = 0;
3223 3224 3225

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
3226
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3227
	if (!head)
3228
		goto out_delayed_unlock;
3229

3230
	spin_lock(&head->lock);
3231
	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3232 3233
		goto out;

	if (cleanup_extent_op(head) != NULL)
		goto out;
3236

3237 3238 3239 3240 3241 3242 3243
	/*
	 * waiting for the lock here would deadlock.  If someone else has it
	 * locked they are already in the process of dropping it anyway
	 */
	if (!mutex_trylock(&head->mutex))
		goto out;

3244
	btrfs_delete_ref_head(delayed_refs, head);
3245
	head->processing = 0;
3246

3247
	spin_unlock(&head->lock);
3248 3249
	spin_unlock(&delayed_refs->lock);

3250 3251 3252 3253
	BUG_ON(head->extent_op);
	if (head->must_insert_reserved)
		ret = 1;

3254
	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3255
	mutex_unlock(&head->mutex);
3256
	btrfs_put_delayed_ref_head(head);
3257
	return ret;
3258
out:
3259
	spin_unlock(&head->lock);
3260 3261

out_delayed_unlock:
3262 3263 3264 3265
	spin_unlock(&delayed_refs->lock);
	return 0;
}

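/*
 * Free a tree block: queue a delayed ref drop and, if this was the last
 * reference and the block was allocated in the current transaction (and
 * never written out), hand the space straight back to the free space cache
 * instead of pinning it until the transaction commits.
 */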
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct extent_buffer *buf,
			   u64 parent, int last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
3272
	struct btrfs_ref generic_ref = { 0 };
3273
	int pin = 1;
3274 3275
	int ret;

3276 3277 3278 3279 3280
	btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
			       buf->start, buf->len, parent);
	btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
			    root->root_key.objectid);

3281
	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
3282 3283
		int old_ref_mod, new_ref_mod;

3284
		btrfs_ref_tree_mod(fs_info, &generic_ref);
3285
		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
3286
						 &old_ref_mod, &new_ref_mod);
3287
		BUG_ON(ret); /* -ENOMEM */
3288
		pin = old_ref_mod >= 0 && new_ref_mod < 0;
3289 3290
	}

3291
	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
3292
		struct btrfs_block_group *cache;
3293

3294
		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
3295
			ret = check_ref_cleanup(trans, buf->start);
3296
			if (!ret)
3297
				goto out;
3298 3299
		}

3300
		pin = 0;
3301
		cache = btrfs_lookup_block_group(fs_info, buf->start);
3302

3303
		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3304
			pin_down_extent(cache, buf->start, buf->len, 1);
3305
			btrfs_put_block_group(cache);
3306
			goto out;
3307 3308 3309 3310 3311
		}

		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));

		btrfs_add_free_space(cache, buf->start, buf->len);
3312
		btrfs_free_reserved_bytes(cache, buf->len, 0);
3313
		btrfs_put_block_group(cache);
3314
		trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3315 3316
	}
out:
3317
	if (pin)
3318
		add_pinned_bytes(fs_info, &generic_ref);
3319

3320 3321 3322 3323 3324 3325 3326
	if (last_ref) {
		/*
		 * Deleting the buffer, clear the corrupt flag since it doesn't
		 * matter anymore.
		 */
		clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
	}
3327 3328
}

3329
/* Can return -ENOMEM */
3330
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3331
{
3332
	struct btrfs_fs_info *fs_info = trans->fs_info;
3333
	int old_ref_mod, new_ref_mod;
3334 3335
	int ret;

3336
	if (btrfs_is_testing(fs_info))
3337
		return 0;
3338

3339 3340 3341 3342
	/*
	 * tree log blocks never actually go into the extent allocation
	 * tree, just update pinning info and exit early.
	 */
3343 3344 3345 3346
	if ((ref->type == BTRFS_REF_METADATA &&
	     ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
	    (ref->type == BTRFS_REF_DATA &&
	     ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
3347
		/* unlocks the pinned mutex */
3348
		btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
3349
		old_ref_mod = new_ref_mod = 0;
3350
		ret = 0;
3351 3352
	} else if (ref->type == BTRFS_REF_METADATA) {
		ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
3353
						 &old_ref_mod, &new_ref_mod);
3354
	} else {
3355
		ret = btrfs_add_delayed_data_ref(trans, ref, 0,
3356
						 &old_ref_mod, &new_ref_mod);
3357
	}
3358

3359 3360 3361 3362 3363
	if (!((ref->type == BTRFS_REF_METADATA &&
	       ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
	      (ref->type == BTRFS_REF_DATA &&
	       ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
		btrfs_ref_tree_mod(fs_info, ref);
3364

3365
	if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
3366
		add_pinned_bytes(fs_info, ref);
3367

3368 3369 3370
	return ret;
}

enum btrfs_loop_type {
3372 3373 3374 3375
	LOOP_CACHING_NOWAIT,
	LOOP_CACHING_WAIT,
	LOOP_ALLOC_CHUNK,
	LOOP_NO_EMPTY_SIZE,
};

3378
static inline void
3379
btrfs_lock_block_group(struct btrfs_block_group *cache,
3380 3381 3382 3383 3384 3385
		       int delalloc)
{
	if (delalloc)
		down_read(&cache->data_rwsem);
}

3386
static inline void btrfs_grab_block_group(struct btrfs_block_group *cache,
3387 3388 3389 3390 3391 3392 3393
		       int delalloc)
{
	btrfs_get_block_group(cache);
	if (delalloc)
		down_read(&cache->data_rwsem);
}

3394 3395
static struct btrfs_block_group *btrfs_lock_cluster(
		   struct btrfs_block_group *block_group,
3396 3397 3398
		   struct btrfs_free_cluster *cluster,
		   int delalloc)
{
3399
	struct btrfs_block_group *used_bg = NULL;
3400

3401
	spin_lock(&cluster->refill_lock);
3402 3403 3404 3405 3406 3407
	while (1) {
		used_bg = cluster->block_group;
		if (!used_bg)
			return NULL;

		if (used_bg == block_group)
3408 3409
			return used_bg;

3410
		btrfs_get_block_group(used_bg);
3411

3412 3413
		if (!delalloc)
			return used_bg;
3414

3415 3416
		if (down_read_trylock(&used_bg->data_rwsem))
			return used_bg;
3417

3418
		spin_unlock(&cluster->refill_lock);
3419

3420 3421
		/* We should only have one-level nested. */
		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
3422

3423 3424 3425
		spin_lock(&cluster->refill_lock);
		if (used_bg == cluster->block_group)
			return used_bg;
3426

3427 3428 3429
		up_read(&used_bg->data_rwsem);
		btrfs_put_block_group(used_bg);
	}
3430 3431 3432
}

static inline void
3433
btrfs_release_block_group(struct btrfs_block_group *cache,
3434 3435 3436 3437 3438 3439 3440
			 int delalloc)
{
	if (delalloc)
		up_read(&cache->data_rwsem);
	btrfs_put_block_group(cache);
}

3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463
/*
 * Structure used internally for find_free_extent() function.  Wraps needed
 * parameters.
 */
struct find_free_extent_ctl {
	/* Basic allocation info */
	u64 num_bytes;
	u64 empty_size;
	u64 flags;
	int delalloc;

	/* Where to start the search inside the bg */
	u64 search_start;

	/* For clustered allocation */
	u64 empty_cluster;

	bool have_caching_bg;
	bool orig_have_caching_bg;

	/* RAID index, converted from flags */
	int index;

3464 3465 3466
	/*
	 * Current loop number, check find_free_extent_update_loop() for details
	 */
3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493
	int loop;

	/*
	 * Whether we're refilling a cluster, if true we need to re-search
	 * current block group but don't try to refill the cluster again.
	 */
	bool retry_clustered;

	/*
	 * Whether we're updating free space cache, if true we need to re-search
	 * current block group but don't try updating free space cache again.
	 */
	bool retry_unclustered;

	/* If current block group is cached */
	int cached;

	/* Max contiguous hole found */
	u64 max_extent_size;

	/* Total free space from free space cache, not always contiguous */
	u64 total_free_space;

	/* Found result */
	u64 found_offset;
};

3494 3495 3496 3497 3498 3499 3500 3501 3502

/*
 * Helper function for find_free_extent().
 *
 * Return -ENOENT to inform caller that we need fallback to unclustered mode.
 * Return -EAGAIN to inform caller that we need to re-search this block group
 * Return >0 to inform caller that we find nothing
 * Return 0 means we have found a location and set ffe_ctl->found_offset.
 */
3503
static int find_free_extent_clustered(struct btrfs_block_group *bg,
3504 3505
		struct btrfs_free_cluster *last_ptr,
		struct find_free_extent_ctl *ffe_ctl,
3506
		struct btrfs_block_group **cluster_bg_ret)
3507
{
3508
	struct btrfs_block_group *cluster_bg;
	u64 aligned_cluster;
	u64 offset;
	int ret;

	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
	if (!cluster_bg)
		goto refill_cluster;
	if (cluster_bg != bg && (cluster_bg->ro ||
	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
		goto release_cluster;

	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
			ffe_ctl->num_bytes, cluster_bg->start,
			&ffe_ctl->max_extent_size);
	if (offset) {
		/* We have a block, we're done */
		spin_unlock(&last_ptr->refill_lock);
		trace_btrfs_reserve_extent_cluster(cluster_bg,
				ffe_ctl->search_start, ffe_ctl->num_bytes);
		*cluster_bg_ret = cluster_bg;
		ffe_ctl->found_offset = offset;
		return 0;
	}
	WARN_ON(last_ptr->block_group != cluster_bg);

release_cluster:
	/*
	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
	 * let's just skip it and let the allocator find whatever block it can
	 * find. If we reach this point, we will have tried the cluster
	 * allocator plenty of times and not have found anything, so we are
	 * likely way too fragmented for the clustering stuff to find anything.
	 *
	 * However, if the cluster is taken from the current block group,
	 * release the cluster first, so that we stand a better chance of
	 * succeeding in the unclustered allocation.
	 */
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
		spin_unlock(&last_ptr->refill_lock);
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
		return -ENOENT;
	}

	/* This cluster didn't work out, free it and start over */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);

	if (cluster_bg != bg)
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);

refill_cluster:
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
		spin_unlock(&last_ptr->refill_lock);
		return -ENOENT;
	}

	aligned_cluster = max_t(u64,
			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
			bg->full_stripe_len);
	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
			ffe_ctl->num_bytes, aligned_cluster);
	if (ret == 0) {
		/* Now pull our allocation out of this cluster */
		offset = btrfs_alloc_from_cluster(bg, last_ptr,
				ffe_ctl->num_bytes, ffe_ctl->search_start,
				&ffe_ctl->max_extent_size);
		if (offset) {
			/* We found one, proceed */
			spin_unlock(&last_ptr->refill_lock);
			trace_btrfs_reserve_extent_cluster(bg,
					ffe_ctl->search_start,
					ffe_ctl->num_bytes);
			ffe_ctl->found_offset = offset;
			return 0;
		}
	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
		   !ffe_ctl->retry_clustered) {
		spin_unlock(&last_ptr->refill_lock);

		ffe_ctl->retry_clustered = true;
		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
				ffe_ctl->empty_cluster + ffe_ctl->empty_size);
		return -EAGAIN;
	}
	/*
	 * At this point we either didn't find a cluster or we weren't able to
	 * allocate a block from our cluster.  Free the cluster we've been
	 * trying to use, and go to the next block group.
	 */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);
	spin_unlock(&last_ptr->refill_lock);
	return 1;
}
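
/*
 * A rough sketch (mirroring the caller below in find_free_extent()) of how
 * the return values of find_free_extent_clustered() are expected to be
 * dispatched:
 *
 *	ret = find_free_extent_clustered(bg, last_ptr, &ffe_ctl, &cluster_bg);
 *	if (ret == 0)
 *		goto checks;		// ffe_ctl.found_offset is valid
 *	else if (ret == -EAGAIN)
 *		goto have_block_group;	// re-search this block group
 *	else if (ret > 0)
 *		goto loop;		// nothing here, try the next group
 *	// -ENOENT falls through to the unclustered path
 */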

/*
 * Return >0 to inform the caller that we found nothing.
 * Return 0 when we found a free extent and set ffe_ctl->found_offset.
 * Return -EAGAIN to inform the caller that we need to re-search this block group.
 */
static int find_free_extent_unclustered(struct btrfs_block_group *bg,
		struct btrfs_free_cluster *last_ptr,
		struct find_free_extent_ctl *ffe_ctl)
{
	u64 offset;

	/*
	 * We are doing an unclustered allocation, set the fragmented flag so
	 * we don't bother trying to set up a cluster again until we get more
	 * space.
	 */
	if (unlikely(last_ptr)) {
		spin_lock(&last_ptr->lock);
		last_ptr->fragmented = 1;
		spin_unlock(&last_ptr->lock);
	}
	if (ffe_ctl->cached) {
		struct btrfs_free_space_ctl *free_space_ctl;

		free_space_ctl = bg->free_space_ctl;
		spin_lock(&free_space_ctl->tree_lock);
		if (free_space_ctl->free_space <
		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
		    ffe_ctl->empty_size) {
			ffe_ctl->total_free_space = max_t(u64,
					ffe_ctl->total_free_space,
					free_space_ctl->free_space);
			spin_unlock(&free_space_ctl->tree_lock);
			return 1;
		}
		spin_unlock(&free_space_ctl->tree_lock);
	}

	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
			ffe_ctl->num_bytes, ffe_ctl->empty_size,
			&ffe_ctl->max_extent_size);

	/*
	 * If we didn't find a chunk, and we haven't failed on this block group
	 * before, and this block group is in the middle of caching and we are
	 * ok with waiting, then go ahead and wait for progress to be made, and
	 * set @retry_unclustered to true.
	 *
	 * If @retry_unclustered is true then we've already waited on this
	 * block group once and should move on to the next block group.
	 */
	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
	    ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
						      ffe_ctl->empty_size);
		ffe_ctl->retry_unclustered = true;
		return -EAGAIN;
	} else if (!offset) {
		return 1;
	}
	ffe_ctl->found_offset = offset;
	return 0;
}

/*
 * Return >0 means the caller needs to re-search for a free extent.
 * Return 0 means we have the needed free extent.
 * Return <0 means we failed to locate any free extent.
 */
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
					struct btrfs_free_cluster *last_ptr,
					struct btrfs_key *ins,
					struct find_free_extent_ctl *ffe_ctl,
					int full_search, bool use_cluster)
{
	struct btrfs_root *root = fs_info->extent_root;
	int ret;

	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
		ffe_ctl->orig_have_caching_bg = true;

	if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
	    ffe_ctl->have_caching_bg)
		return 1;

	if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
		return 1;

	if (ins->objectid) {
		if (!use_cluster && last_ptr) {
			spin_lock(&last_ptr->lock);
			last_ptr->window_start = ins->objectid;
			spin_unlock(&last_ptr->lock);
		}
		return 0;
	}

	/*
	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
	 *			caching kthreads as we move along
	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
	 *		       again
	 */
	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
		ffe_ctl->index = 0;
		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
			/*
			 * We want to skip the LOOP_CACHING_WAIT step if we
			 * don't have any uncached bgs and we've already done a
			 * full search through.
			 */
			if (ffe_ctl->orig_have_caching_bg || !full_search)
				ffe_ctl->loop = LOOP_CACHING_WAIT;
			else
				ffe_ctl->loop = LOOP_ALLOC_CHUNK;
		} else {
			ffe_ctl->loop++;
		}

		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
			struct btrfs_trans_handle *trans;
			int exist = 0;

			trans = current->journal_info;
			if (trans)
				exist = 1;
			else
				trans = btrfs_join_transaction(root);

			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				return ret;
			}

			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
						CHUNK_ALLOC_FORCE);

			/*
			 * If we can't allocate a new chunk, we've already looped
			 * through at least once, so move on to the NO_EMPTY_SIZE
			 * case.
			 */
			if (ret == -ENOSPC)
				ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;

			/* Do not bail out on ENOSPC since we can do more. */
			if (ret < 0 && ret != -ENOSPC)
				btrfs_abort_transaction(trans, ret);
			else
				ret = 0;
			if (!exist)
				btrfs_end_transaction(trans);
			if (ret)
				return ret;
		}

		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
			/*
			 * Don't loop again if we already have no empty_size and
			 * no empty_cluster.
			 */
			if (ffe_ctl->empty_size == 0 &&
			    ffe_ctl->empty_cluster == 0)
				return -ENOSPC;
			ffe_ctl->empty_size = 0;
			ffe_ctl->empty_cluster = 0;
		}
		return 1;
	}
	return -ENOSPC;
}

/*
 * walks the btree of allocated extents and finds a hole of a given size.
 * The key ins is changed to record the hole:
 * ins->objectid == start position
 * ins->flags = BTRFS_EXTENT_ITEM_KEY
 * ins->offset == the size of the hole.
 * Any available blocks before search_start are skipped.
 *
 * If there is no suitable free space, we will record the max size of
 * the free space extent currently.
 *
 * The overall logic and call chain:
 *
 * find_free_extent()
 * |- Iterate through all block groups
 * |  |- Get a valid block group
 * |  |- Try to do clustered allocation in that block group
 * |  |- Try to do unclustered allocation in that block group
 * |  |- Check if the result is valid
 * |  |  |- If valid, then exit
 * |  |- Jump to next block group
 * |
 * |- Push harder to find free extents
 *    |- If not found, re-iterate all block groups
 */
static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
				u64 ram_bytes, u64 num_bytes, u64 empty_size,
				u64 hint_byte, struct btrfs_key *ins,
				u64 flags, int delalloc)
{
	int ret = 0;
	int cache_block_group_error = 0;
	struct btrfs_free_cluster *last_ptr = NULL;
	struct btrfs_block_group *block_group = NULL;
	struct find_free_extent_ctl ffe_ctl = {0};
	struct btrfs_space_info *space_info;
	bool use_cluster = true;
	bool full_search = false;

	WARN_ON(num_bytes < fs_info->sectorsize);

	ffe_ctl.num_bytes = num_bytes;
	ffe_ctl.empty_size = empty_size;
	ffe_ctl.flags = flags;
	ffe_ctl.search_start = 0;
	ffe_ctl.retry_clustered = false;
	ffe_ctl.retry_unclustered = false;
	ffe_ctl.delalloc = delalloc;
	ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
	ffe_ctl.have_caching_bg = false;
	ffe_ctl.orig_have_caching_bg = false;
	ffe_ctl.found_offset = 0;

	ins->type = BTRFS_EXTENT_ITEM_KEY;
	ins->objectid = 0;
	ins->offset = 0;

	trace_find_free_extent(fs_info, num_bytes, empty_size, flags);

	space_info = btrfs_find_space_info(fs_info, flags);
	if (!space_info) {
		btrfs_err(fs_info, "No space info for %llu", flags);
		return -ENOSPC;
	}

	/*
	 * If our free space is heavily fragmented we may not be able to make
	 * big contiguous allocations, so instead of doing the expensive search
	 * for free space, simply return ENOSPC with our max_extent_size so we
	 * can go ahead and search for a more manageable chunk.
	 *
	 * If our max_extent_size is large enough for our allocation simply
	 * disable clustering since we will likely not be able to find enough
	 * space to create a cluster and induce latency trying.
	 */
	if (unlikely(space_info->max_extent_size)) {
		spin_lock(&space_info->lock);
		if (space_info->max_extent_size &&
		    num_bytes > space_info->max_extent_size) {
			ins->offset = space_info->max_extent_size;
			spin_unlock(&space_info->lock);
			return -ENOSPC;
		} else if (space_info->max_extent_size) {
			use_cluster = false;
		}
		spin_unlock(&space_info->lock);
	}

	last_ptr = fetch_cluster_info(fs_info, space_info,
				      &ffe_ctl.empty_cluster);
	if (last_ptr) {
		spin_lock(&last_ptr->lock);
		if (last_ptr->block_group)
			hint_byte = last_ptr->window_start;
		if (last_ptr->fragmented) {
			/*
			 * We still set window_start so we can keep track of the
			 * last place we found an allocation to try and save
			 * some time.
			 */
			hint_byte = last_ptr->window_start;
			use_cluster = false;
		}
		spin_unlock(&last_ptr->lock);
	}

	ffe_ctl.search_start = max(ffe_ctl.search_start,
				   first_logical_byte(fs_info, 0));
	ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
	if (ffe_ctl.search_start == hint_byte) {
		block_group = btrfs_lookup_block_group(fs_info,
						       ffe_ctl.search_start);
		/*
		 * we don't want to use the block group if it doesn't match our
		 * allocation bits, or if it's not cached.
		 *
		 * However if we are re-searching with an ideal block group
		 * picked out then we don't care that the block group is cached.
		 */
		if (block_group && block_group_bits(block_group, flags) &&
		    block_group->cached != BTRFS_CACHE_NO) {
			down_read(&space_info->groups_sem);
			if (list_empty(&block_group->list) ||
			    block_group->ro) {
				/*
				 * someone is removing this block group,
				 * we can't jump into the have_block_group
				 * target because our list pointers are not
				 * valid
				 */
				btrfs_put_block_group(block_group);
				up_read(&space_info->groups_sem);
			} else {
				ffe_ctl.index = btrfs_bg_flags_to_raid_index(
						block_group->flags);
				btrfs_lock_block_group(block_group, delalloc);
				goto have_block_group;
			}
		} else if (block_group) {
			btrfs_put_block_group(block_group);
		}
	}
search:
	ffe_ctl.have_caching_bg = false;
	if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
	    ffe_ctl.index == 0)
		full_search = true;
	down_read(&space_info->groups_sem);
	list_for_each_entry(block_group,
			    &space_info->block_groups[ffe_ctl.index], list) {
		/* If the block group is read-only, we can skip it entirely. */
		if (unlikely(block_group->ro))
			continue;

		btrfs_grab_block_group(block_group, delalloc);
		ffe_ctl.search_start = block_group->start;

		/*
		 * this can happen if we end up cycling through all the
		 * raid types, but we want to make sure we only allocate
		 * for the proper type.
		 */
		if (!block_group_bits(block_group, flags)) {
			u64 extra = BTRFS_BLOCK_GROUP_DUP |
				BTRFS_BLOCK_GROUP_RAID1_MASK |
				BTRFS_BLOCK_GROUP_RAID56_MASK |
				BTRFS_BLOCK_GROUP_RAID10;

			/*
			 * if they asked for extra copies and this block group
			 * doesn't provide them, bail.  This does allow us to
			 * fill raid0 from raid1.
			 */
			if ((flags & extra) && !(block_group->flags & extra))
				goto loop;

			/*
			 * This block group has different flags than we want.
			 * It's possible that we have MIXED_GROUP flag but no
			 * block group is mixed.  Just skip such block group.
			 */
			btrfs_release_block_group(block_group, delalloc);
			continue;
		}

have_block_group:
		ffe_ctl.cached = btrfs_block_group_done(block_group);
		if (unlikely(!ffe_ctl.cached)) {
			ffe_ctl.have_caching_bg = true;
			ret = btrfs_cache_block_group(block_group, 0);

			/*
			 * If we get ENOMEM here or something else we want to
			 * try other block groups, because it may not be fatal.
			 * However if we can't find anything else we need to
			 * save our return here so that we return the actual
			 * error that caused problems, not ENOSPC.
			 */
			if (ret < 0) {
				if (!cache_block_group_error)
					cache_block_group_error = ret;
				ret = 0;
				goto loop;
			}
			ret = 0;
		}

		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
			goto loop;

		/*
		 * Ok we want to try and use the cluster allocator, so
		 * let's look there
		 */
		if (last_ptr && use_cluster) {
			struct btrfs_block_group *cluster_bg = NULL;

			ret = find_free_extent_clustered(block_group, last_ptr,
							 &ffe_ctl, &cluster_bg);

			if (ret == 0) {
				if (cluster_bg && cluster_bg != block_group) {
					btrfs_release_block_group(block_group,
								  delalloc);
					block_group = cluster_bg;
				}
				goto checks;
			} else if (ret == -EAGAIN) {
				goto have_block_group;
			} else if (ret > 0) {
				goto loop;
			}
			/* ret == -ENOENT case falls through */
		}

		ret = find_free_extent_unclustered(block_group, last_ptr,
						   &ffe_ctl);
		if (ret == -EAGAIN)
			goto have_block_group;
		else if (ret > 0)
			goto loop;
		/* ret == 0 case falls through */
checks:
		ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
					     fs_info->stripesize);

		/* move on to the next group */
		if (ffe_ctl.search_start + num_bytes >
		    block_group->start + block_group->length) {
			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
					     num_bytes);
			goto loop;
		}

		if (ffe_ctl.found_offset < ffe_ctl.search_start)
			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
				ffe_ctl.search_start - ffe_ctl.found_offset);

		ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
				num_bytes, delalloc);
		if (ret == -EAGAIN) {
			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
					     num_bytes);
			goto loop;
		}
		btrfs_inc_block_group_reservations(block_group);

		/* we are all good, lets return */
		ins->objectid = ffe_ctl.search_start;
		ins->offset = num_bytes;

		trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
					   num_bytes);
		btrfs_release_block_group(block_group, delalloc);
		break;
loop:
		ffe_ctl.retry_clustered = false;
		ffe_ctl.retry_unclustered = false;
		BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
		       ffe_ctl.index);
		btrfs_release_block_group(block_group, delalloc);
		cond_resched();
	}
	up_read(&space_info->groups_sem);

	ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
					   full_search, use_cluster);
	if (ret > 0)
		goto search;

	if (ret == -ENOSPC && !cache_block_group_error) {
		/*
		 * Use ffe_ctl->total_free_space as fallback if we can't find
		 * any contiguous hole.
		 */
		if (!ffe_ctl.max_extent_size)
			ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
		spin_lock(&space_info->lock);
		space_info->max_extent_size = ffe_ctl.max_extent_size;
		spin_unlock(&space_info->lock);
		ins->offset = ffe_ctl.max_extent_size;
	} else if (ret == -ENOSPC) {
		ret = cache_block_group_error;
	}
	return ret;
}

/*
 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
 *			  hole that is at least as big as @num_bytes.
 *
 * @root           -	The root that will contain this extent
 *
 * @ram_bytes      -	The amount of space in ram that @num_bytes take. This
 *			is used for accounting purposes. This value differs
 *			from @num_bytes only in the case of compressed extents.
 *
 * @num_bytes      -	Number of bytes to allocate on-disk.
 *
 * @min_alloc_size -	Indicates the minimum amount of space that the
 *			allocator should try to satisfy. In some cases
 *			@num_bytes may be larger than what is required and if
 *			the filesystem is fragmented then allocation fails.
 *			However, the presence of @min_alloc_size gives a
 *			chance to try and satisfy the smaller allocation.
 *
 * @empty_size     -	A hint that you plan on doing more COW. This is the
 *			size in bytes the allocator should try to find free
 *			next to the block it returns.  This is just a hint and
 *			may be ignored by the allocator.
 *
 * @hint_byte      -	Hint to the allocator to start searching above the byte
 *			address passed. It might be ignored.
 *
 * @ins            -	This key is modified to record the found hole. It will
 *			have the following values:
 *			ins->objectid == start position
 *			ins->flags = BTRFS_EXTENT_ITEM_KEY
 *			ins->offset == the size of the hole.
 *
 * @is_data        -	Boolean flag indicating whether an extent is
 *			allocated for data (true) or metadata (false)
 *
 * @delalloc       -	Boolean flag indicating whether this allocation is for
 *			delalloc or not. If 'true' data_rwsem of block groups
 *			is going to be acquired.
 *
 *
 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
 * case -ENOSPC is returned then @ins->offset will contain the size of the
 * largest available hole the allocator managed to find.
 */
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
			 u64 num_bytes, u64 min_alloc_size,
			 u64 empty_size, u64 hint_byte,
			 struct btrfs_key *ins, int is_data, int delalloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	bool final_tried = num_bytes == min_alloc_size;
	u64 flags;
	int ret;

	flags = get_alloc_profile_by_root(root, is_data);
again:
	WARN_ON(num_bytes < fs_info->sectorsize);
	ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
			       hint_byte, ins, flags, delalloc);
	if (!ret && !is_data) {
		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
	} else if (ret == -ENOSPC) {
		if (!final_tried && ins->offset) {
			num_bytes = min(num_bytes >> 1, ins->offset);
			num_bytes = round_down(num_bytes,
					       fs_info->sectorsize);
			num_bytes = max(num_bytes, min_alloc_size);
			ram_bytes = num_bytes;
			if (num_bytes == min_alloc_size)
				final_tried = true;
			goto again;
		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
			struct btrfs_space_info *sinfo;

			sinfo = btrfs_find_space_info(fs_info, flags);
			btrfs_err(fs_info,
				  "allocation failed flags %llu, wanted %llu",
				  flags, num_bytes);
			if (sinfo)
				btrfs_dump_space_info(fs_info, sinfo,
						      num_bytes, 1);
		}
	}

	return ret;
}
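
/*
 * Illustrative use only (the argument values here are hypothetical): a data
 * allocation caller typically reserves an extent and then reads the result
 * back out of @ins, e.g.
 *
 *	struct btrfs_key ins = { 0 };
 *	ret = btrfs_reserve_extent(root, alloc_size, alloc_size,
 *				   fs_info->sectorsize, 0, alloc_hint,
 *				   &ins, 1, 1);
 *	if (!ret) {
 *		start = ins.objectid;	// first byte of the reservation
 *		len = ins.offset;	// number of bytes reserved
 *	}
 */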

int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
			       u64 start, u64 len, int delalloc)
{
	struct btrfs_block_group *cache;

	cache = btrfs_lookup_block_group(fs_info, start);
	if (!cache) {
		btrfs_err(fs_info, "Unable to find block group for %llu",
			  start);
		return -ENOSPC;
	}

	btrfs_add_free_space(cache, start, len);
	btrfs_free_reserved_bytes(cache, len, delalloc);
	trace_btrfs_reserved_extent_free(fs_info, start, len);

	btrfs_put_block_group(cache);
	return 0;
}

int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	struct btrfs_block_group *cache;
	int ret = 0;

	cache = btrfs_lookup_block_group(fs_info, start);
	if (!cache) {
		btrfs_err(fs_info, "unable to find block group for %llu", start);
		return -ENOSPC;
	}

	ret = pin_down_extent(cache, start, len, 1);
	btrfs_put_block_group(cache);
	return ret;
}
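
/*
 * The item inserted by the helper below packs the back reference inline
 * after the extent item.  Roughly (in the order the fields are written):
 *
 *	[ btrfs_extent_item | btrfs_extent_inline_ref | shared/extent data ref ]
 *
 * A shared ref (parent > 0) stores the parent bytenr in the inline ref
 * offset, while a normal data ref stores root/objectid/offset/count in a
 * btrfs_extent_data_ref that follows the inline ref header.
 */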

static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int type;
	u32 size;

	if (parent > 0)
		type = BTRFS_SHARED_DATA_REF_KEY;
	else
		type = BTRFS_EXTENT_DATA_REF_KEY;

	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;
	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      ins, size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_DATA);

	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (parent > 0) {
		struct btrfs_shared_data_ref *ref;
		ref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
	} else {
		struct btrfs_extent_data_ref *ref;
		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
	}

	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_free_path(path);

	ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
	if (ret)
		return ret;

	ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
	if (ret) { /* -ENOENT, logic error */
		btrfs_err(fs_info, "update block group failed for %llu %llu",
			ins->objectid, ins->offset);
		BUG();
	}
	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
	return ret;
}
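
/*
 * Same idea for tree blocks below: with the SKINNY_METADATA feature the key
 * encodes the level (BTRFS_METADATA_ITEM_KEY) and no btrfs_tree_block_info
 * is written; without it the key carries the byte size
 * (BTRFS_EXTENT_ITEM_KEY) and a btrfs_tree_block_info sits between the
 * extent item and the inline ref.
 */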

static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_key extent_key;
	struct btrfs_tree_block_info *block_info;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_delayed_tree_ref *ref;
	u32 size = sizeof(*extent_item) + sizeof(*iref);
	u64 num_bytes;
	u64 flags = extent_op->flags_to_set;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

	ref = btrfs_delayed_node_to_tree_ref(node);

	extent_key.objectid = node->bytenr;
	if (skinny_metadata) {
		extent_key.offset = ref->level;
		extent_key.type = BTRFS_METADATA_ITEM_KEY;
		num_bytes = fs_info->nodesize;
	} else {
		extent_key.offset = node->num_bytes;
		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
		size += sizeof(*block_info);
		num_bytes = node->num_bytes;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;
	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      &extent_key, size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, 1);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);

	if (skinny_metadata) {
		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	} else {
		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
		btrfs_set_tree_block_level(leaf, block_info, ref->level);
		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
	}

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_SHARED_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
	} else {
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_TREE_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
	}

	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	ret = remove_from_free_space_tree(trans, extent_key.objectid,
					  num_bytes);
	if (ret)
		return ret;

	ret = btrfs_update_block_group(trans, extent_key.objectid,
				       fs_info->nodesize, 1);
	if (ret) { /* -ENOENT, logic error */
		btrfs_err(fs_info, "update block group failed for %llu %llu",
			extent_key.objectid, extent_key.offset);
		BUG();
	}

	trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
					  fs_info->nodesize);
	return ret;
}

int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root, u64 owner,
				     u64 offset, u64 ram_bytes,
				     struct btrfs_key *ins)
{
	struct btrfs_ref generic_ref = { 0 };
	int ret;

	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);

	btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
			       ins->objectid, ins->offset, 0);
	btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
	btrfs_ref_tree_mod(root->fs_info, &generic_ref);
	ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
					 ram_bytes, NULL, NULL);
	return ret;
}

/*
 * this is used by the tree logging recovery code.  It records that
 * an extent has been allocated and makes sure to clear the free
 * space cache bits as well
 */
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
				   u64 root_objectid, u64 owner, u64 offset,
				   struct btrfs_key *ins)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;

	/*
	 * Mixed block groups will exclude before processing the log so we only
	 * need to do the exclude dance if this fs isn't mixed.
	 */
	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = __exclude_logged_extent(fs_info, ins->objectid,
					      ins->offset);
		if (ret)
			return ret;
	}

	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
	if (!block_group)
		return -EINVAL;

	space_info = block_group->space_info;
	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	space_info->bytes_reserved += ins->offset;
	block_group->reserved += ins->offset;
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);

	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
					 offset, ins, 1);
	if (ret)
		btrfs_pin_extent(fs_info, ins->objectid, ins->offset, 1);
	btrfs_put_block_group(block_group);
	return ret;
}

static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      u64 bytenr, int level, u64 owner)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *buf;

	buf = btrfs_find_create_tree_block(fs_info, bytenr);
	if (IS_ERR(buf))
		return buf;

	/*
	 * Extra safety check in case the extent tree is corrupted and extent
	 * allocator chooses to use a tree block which is already used and
	 * locked.
	 */
	if (buf->lock_owner == current->pid) {
		btrfs_err_rl(fs_info,
"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
			buf->start, btrfs_header_owner(buf), current->pid);
		free_extent_buffer(buf);
		return ERR_PTR(-EUCLEAN);
	}

	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
	btrfs_tree_lock(buf);
	btrfs_clean_tree_block(buf);
	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);

	btrfs_set_lock_blocking_write(buf);
	set_extent_buffer_uptodate(buf);

	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(buf, level);
	btrfs_set_header_bytenr(buf, buf->start);
	btrfs_set_header_generation(buf, trans->transid);
	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(buf, owner);
	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
		buf->log_index = root->log_transid % 2;
		/*
		 * we allow two log transactions at a time, use different
		 * EXTENT bit to differentiate dirty pages.
		 */
		if (buf->log_index == 0)
			set_extent_dirty(&root->dirty_log_pages, buf->start,
					buf->start + buf->len - 1, GFP_NOFS);
		else
			set_extent_new(&root->dirty_log_pages, buf->start,
					buf->start + buf->len - 1);
	} else {
		buf->log_index = -1;
		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
			 buf->start + buf->len - 1, GFP_NOFS);
	}
	trans->dirty = true;
	/* this returns a buffer locked for blocking */
	return buf;
}

/*
 * finds a free extent and does all the dirty work required for allocation
 * returns the tree buffer or an ERR_PTR on error.
 */
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     u64 parent, u64 root_objectid,
					     const struct btrfs_disk_key *key,
					     int level, u64 hint,
					     u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key ins;
	struct btrfs_block_rsv *block_rsv;
	struct extent_buffer *buf;
	struct btrfs_delayed_extent_op *extent_op;
	struct btrfs_ref generic_ref = { 0 };
	u64 flags = 0;
	int ret;
	u32 blocksize = fs_info->nodesize;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
	if (btrfs_is_testing(fs_info)) {
		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
					    level, root_objectid);
		if (!IS_ERR(buf))
			root->alloc_bytenr += blocksize;
		return buf;
	}
#endif

	block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
	if (IS_ERR(block_rsv))
		return ERR_CAST(block_rsv);

	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
				   empty_size, hint, &ins, 0, 0);
	if (ret)
		goto out_unuse;

	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
				    root_objectid);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_free_reserved;
	}

	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent == 0)
			parent = ins.objectid;
		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
	} else
		BUG_ON(parent > 0);

	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
		extent_op = btrfs_alloc_delayed_extent_op();
		if (!extent_op) {
			ret = -ENOMEM;
			goto out_free_buf;
		}
		if (key)
			memcpy(&extent_op->key, key, sizeof(extent_op->key));
		else
			memset(&extent_op->key, 0, sizeof(extent_op->key));
		extent_op->flags_to_set = flags;
		extent_op->update_key = skinny_metadata ? false : true;
		extent_op->update_flags = true;
		extent_op->is_data = false;
		extent_op->level = level;

		btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
				       ins.objectid, ins.offset, parent);
		generic_ref.real_root = root->root_key.objectid;
		btrfs_init_tree_ref(&generic_ref, level, root_objectid);
		btrfs_ref_tree_mod(fs_info, &generic_ref);
		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
						 extent_op, NULL, NULL);
		if (ret)
			goto out_free_delayed;
	}
	return buf;

out_free_delayed:
	btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
	free_extent_buffer(buf);
out_free_reserved:
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
out_unuse:
	btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
	return ERR_PTR(ret);
}

struct walk_control {
	u64 refs[BTRFS_MAX_LEVEL];
	u64 flags[BTRFS_MAX_LEVEL];
	struct btrfs_key update_progress;
	struct btrfs_key drop_progress;
	int drop_level;
	int stage;
	int level;
	int shared_level;
	int update_ref;
	int keep_locks;
	int reada_slot;
	int reada_count;
	int restarted;
};

#define DROP_REFERENCE	1
#define UPDATE_BACKREF	2
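
/*
 * The two stages used by the tree walk below: DROP_REFERENCE drops the
 * reference counts of the blocks that are walked, while UPDATE_BACKREF
 * rewrites the back references of a shared subtree before it can be
 * dropped.  See walk_down_proc() and do_walk_down() for the details of
 * each stage.
 */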

static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct walk_control *wc,
				     struct btrfs_path *path)
4618
{
4619
	struct btrfs_fs_info *fs_info = root->fs_info;
Y
Yan, Zheng 已提交
4620 4621 4622
	u64 bytenr;
	u64 generation;
	u64 refs;
4623
	u64 flags;
4624
	u32 nritems;
Y
Yan, Zheng 已提交
4625 4626
	struct btrfs_key key;
	struct extent_buffer *eb;
4627
	int ret;
Y
Yan, Zheng 已提交
4628 4629
	int slot;
	int nread = 0;
4630

Y
Yan, Zheng 已提交
4631 4632 4633 4634 4635 4636
	if (path->slots[wc->level] < wc->reada_slot) {
		wc->reada_count = wc->reada_count * 2 / 3;
		wc->reada_count = max(wc->reada_count, 2);
	} else {
		wc->reada_count = wc->reada_count * 3 / 2;
		wc->reada_count = min_t(int, wc->reada_count,
4637
					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
Y
Yan, Zheng 已提交
4638
	}
4639

Y
Yan, Zheng 已提交
4640 4641
	eb = path->nodes[wc->level];
	nritems = btrfs_header_nritems(eb);
4642

Y
Yan, Zheng 已提交
4643 4644 4645
	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
		if (nread >= wc->reada_count)
			break;
4646

C
Chris Mason 已提交
4647
		cond_resched();
Y
Yan, Zheng 已提交
4648 4649
		bytenr = btrfs_node_blockptr(eb, slot);
		generation = btrfs_node_ptr_generation(eb, slot);
C
Chris Mason 已提交
4650

Y
Yan, Zheng 已提交
4651 4652
		if (slot == path->slots[wc->level])
			goto reada;
4653

Y
Yan, Zheng 已提交
4654 4655
		if (wc->stage == UPDATE_BACKREF &&
		    generation <= root->root_key.offset)
4656 4657
			continue;

4658
		/* We don't lock the tree block, it's OK to be racy here */
4659
		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
4660 4661
					       wc->level - 1, 1, &refs,
					       &flags);
4662 4663 4664
		/* We don't care about errors in readahead. */
		if (ret < 0)
			continue;
4665 4666
		BUG_ON(refs == 0);

Y
Yan, Zheng 已提交
4667 4668 4669
		if (wc->stage == DROP_REFERENCE) {
			if (refs == 1)
				goto reada;
4670

4671 4672 4673
			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
Y
Yan, Zheng 已提交
4674 4675 4676 4677 4678 4679 4680 4681
			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				continue;
			btrfs_node_key_to_cpu(eb, &key, slot);
			ret = btrfs_comp_cpu_keys(&key,
						  &wc->update_progress);
			if (ret < 0)
				continue;
4682 4683 4684 4685
		} else {
			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
4686
		}
Y
Yan, Zheng 已提交
4687
reada:
4688
		readahead_tree_block(fs_info, bytenr);
Y
Yan, Zheng 已提交
4689
		nread++;
C
Chris Mason 已提交
4690
	}
Y
Yan, Zheng 已提交
4691
	wc->reada_slot = slot;
C
Chris Mason 已提交
4692
}
4693

Y
Yan Zheng 已提交
4694
/*
L
Liu Bo 已提交
4695
 * helper to process tree block while walking down the tree.
4696 4697 4698 4699 4700
 *
 * when wc->stage == UPDATE_BACKREF, this function updates
 * back refs for pointers in the block.
 *
 * NOTE: return value 1 means we should stop walking down.
Y
Yan Zheng 已提交
4701
 */
4702
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4703
				   struct btrfs_root *root,
4704
				   struct btrfs_path *path,
4705
				   struct walk_control *wc, int lookup_info)
Y
Yan Zheng 已提交
4706
{
4707
	struct btrfs_fs_info *fs_info = root->fs_info;
4708 4709 4710
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
Y
Yan Zheng 已提交
4711 4712
	int ret;

4713 4714 4715
	if (wc->stage == UPDATE_BACKREF &&
	    btrfs_header_owner(eb) != root->root_key.objectid)
		return 1;
Y
Yan Zheng 已提交
4716

4717 4718 4719 4720
	/*
	 * when reference count of tree block is 1, it won't increase
	 * again. once full backref flag is set, we never clear it.
	 */
4721 4722 4723
	if (lookup_info &&
	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
4724
		BUG_ON(!path->locks[level]);
4725
		ret = btrfs_lookup_extent_info(trans, fs_info,
4726
					       eb->start, level, 1,
4727 4728
					       &wc->refs[level],
					       &wc->flags[level]);
4729 4730 4731
		BUG_ON(ret == -ENOMEM);
		if (ret)
			return ret;
4732 4733
		BUG_ON(wc->refs[level] == 0);
	}
4734

4735 4736 4737
	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level] > 1)
			return 1;
Y
Yan Zheng 已提交
4738

4739
		if (path->locks[level] && !wc->keep_locks) {
4740
			btrfs_tree_unlock_rw(eb, path->locks[level]);
4741 4742 4743 4744
			path->locks[level] = 0;
		}
		return 0;
	}
Y
Yan Zheng 已提交
4745

4746 4747 4748
	/* wc->stage == UPDATE_BACKREF */
	if (!(wc->flags[level] & flag)) {
		BUG_ON(!path->locks[level]);
4749
		ret = btrfs_inc_ref(trans, root, eb, 1);
4750
		BUG_ON(ret); /* -ENOMEM */
4751
		ret = btrfs_dec_ref(trans, root, eb, 0);
4752
		BUG_ON(ret); /* -ENOMEM */
4753
		ret = btrfs_set_disk_extent_flags(trans, eb->start,
4754 4755
						  eb->len, flag,
						  btrfs_header_level(eb), 0);
4756
		BUG_ON(ret); /* -ENOMEM */
4757 4758 4759 4760 4761 4762 4763 4764
		wc->flags[level] |= flag;
	}

	/*
	 * the block is shared by multiple trees, so it's not good to
	 * keep the tree lock
	 */
	if (path->locks[level] && level > 0) {
4765
		btrfs_tree_unlock_rw(eb, path->locks[level]);
4766 4767 4768 4769 4770
		path->locks[level] = 0;
	}
	return 0;
}

4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797
/*
 * This is used to verify a ref exists for this root to deal with a bug where we
 * would have a drop_progress key that hadn't been updated properly.
 */
static int check_ref_exists(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, u64 bytenr, u64 parent,
			    int level)
{
	struct btrfs_path *path;
	struct btrfs_extent_inline_ref *iref;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = lookup_extent_backref(trans, path, &iref, bytenr,
				    root->fs_info->nodesize, parent,
				    root->root_key.objectid, level, 0);
	btrfs_free_path(path);
	if (ret == -ENOENT)
		return 0;
	if (ret < 0)
		return ret;
	return 1;
}

Y
Yan, Zheng 已提交
4798
/*
L
Liu Bo 已提交
4799
 * helper to process tree block pointer.
Y
Yan, Zheng 已提交
4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813
 *
 * when wc->stage == DROP_REFERENCE, this function checks
 * reference count of the block pointed to. if the block
 * is shared and we need update back refs for the subtree
 * rooted at the block, this function changes wc->stage to
 * UPDATE_BACKREF. if the block is shared and there is no
 * need to update back, this function drops the reference
 * to the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
4814
				 struct walk_control *wc, int *lookup_info)
Y
Yan, Zheng 已提交
4815
{
4816
	struct btrfs_fs_info *fs_info = root->fs_info;
Y
Yan, Zheng 已提交
4817 4818 4819 4820
	u64 bytenr;
	u64 generation;
	u64 parent;
	struct btrfs_key key;
4821
	struct btrfs_key first_key;
4822
	struct btrfs_ref ref = { 0 };
Y
Yan, Zheng 已提交
4823 4824 4825 4826
	struct extent_buffer *next;
	int level = wc->level;
	int reada = 0;
	int ret = 0;
4827
	bool need_account = false;
Y
Yan, Zheng 已提交
4828 4829 4830 4831 4832 4833 4834 4835 4836

	generation = btrfs_node_ptr_generation(path->nodes[level],
					       path->slots[level]);
	/*
	 * if the lower level block was created before the snapshot
	 * was created, we know there is no need to update back refs
	 * for the subtree
	 */
	if (wc->stage == UPDATE_BACKREF &&
4837 4838
	    generation <= root->root_key.offset) {
		*lookup_info = 1;
Y
Yan, Zheng 已提交
4839
		return 1;
4840
	}
Y
Yan, Zheng 已提交
4841 4842

	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
4843 4844
	btrfs_node_key_to_cpu(path->nodes[level], &first_key,
			      path->slots[level]);
Y
Yan, Zheng 已提交
4845

4846
	next = find_extent_buffer(fs_info, bytenr);
Y
Yan, Zheng 已提交
4847
	if (!next) {
4848
		next = btrfs_find_create_tree_block(fs_info, bytenr);
4849 4850 4851
		if (IS_ERR(next))
			return PTR_ERR(next);

4852 4853
		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
					       level - 1);
Y
Yan, Zheng 已提交
4854 4855 4856
		reada = 1;
	}
	btrfs_tree_lock(next);
4857
	btrfs_set_lock_blocking_write(next);
Y
Yan, Zheng 已提交
4858

4859
	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
4860 4861
				       &wc->refs[level - 1],
				       &wc->flags[level - 1]);
4862 4863
	if (ret < 0)
		goto out_unlock;
4864

4865
	if (unlikely(wc->refs[level - 1] == 0)) {
4866
		btrfs_err(fs_info, "Missing references.");
4867 4868
		ret = -EIO;
		goto out_unlock;
4869
	}
4870
	*lookup_info = 0;
Y
Yan, Zheng 已提交
4871

4872
	if (wc->stage == DROP_REFERENCE) {
Y
Yan, Zheng 已提交
4873
		if (wc->refs[level - 1] > 1) {
4874
			need_account = true;
4875 4876 4877 4878
			if (level == 1 &&
			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				goto skip;

Y
Yan, Zheng 已提交
4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891
			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				goto skip;

			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
			if (ret < 0)
				goto skip;

			wc->stage = UPDATE_BACKREF;
			wc->shared_level = level - 1;
		}
4892 4893 4894 4895
	} else {
		if (level == 1 &&
		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
			goto skip;
Y
Yan, Zheng 已提交
4896 4897
	}

4898
	if (!btrfs_buffer_uptodate(next, generation, 0)) {
Y
Yan, Zheng 已提交
4899 4900 4901
		btrfs_tree_unlock(next);
		free_extent_buffer(next);
		next = NULL;
4902
		*lookup_info = 1;
Y
Yan, Zheng 已提交
4903 4904 4905 4906 4907
	}

	if (!next) {
		if (reada && level == 1)
			reada_walk_down(trans, root, wc, path);
4908 4909
		next = read_tree_block(fs_info, bytenr, generation, level - 1,
				       &first_key);
4910 4911 4912
		if (IS_ERR(next)) {
			return PTR_ERR(next);
		} else if (!extent_buffer_uptodate(next)) {
4913
			free_extent_buffer(next);
4914
			return -EIO;
4915
		}
Y
Yan, Zheng 已提交
4916
		btrfs_tree_lock(next);
4917
		btrfs_set_lock_blocking_write(next);
Y
Yan, Zheng 已提交
4918 4919 4920
	}

	level--;
4921 4922 4923 4924 4925 4926
	ASSERT(level == btrfs_header_level(next));
	if (level != btrfs_header_level(next)) {
		btrfs_err(root->fs_info, "mismatched level");
		ret = -EIO;
		goto out_unlock;
	}
Y
Yan, Zheng 已提交
4927 4928
	path->nodes[level] = next;
	path->slots[level] = 0;
4929
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
Y
Yan, Zheng 已提交
4930 4931 4932 4933 4934 4935 4936
	wc->level = level;
	if (wc->level == 1)
		wc->reada_slot = 0;
	return 0;
skip:
	wc->refs[level - 1] = 0;
	wc->flags[level - 1] = 0;
4937 4938 4939 4940
	if (wc->stage == DROP_REFERENCE) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			parent = path->nodes[level]->start;
		} else {
4941
			ASSERT(root->root_key.objectid ==
4942
			       btrfs_header_owner(path->nodes[level]));
4943 4944 4945 4946 4947 4948 4949
			if (root->root_key.objectid !=
			    btrfs_header_owner(path->nodes[level])) {
				btrfs_err(root->fs_info,
						"mismatched block owner");
				ret = -EIO;
				goto out_unlock;
			}
4950 4951
			parent = 0;
		}
Y
Yan, Zheng 已提交
4952

4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969
		/*
		 * If we had a drop_progress we need to verify the refs are set
		 * as expected.  If we find our ref then we know that from here
		 * on out everything should be correct, and we can clear the
		 * ->restarted flag.
		 */
		if (wc->restarted) {
			ret = check_ref_exists(trans, root, bytenr, parent,
					       level - 1);
			if (ret < 0)
				goto out_unlock;
			if (ret == 0)
				goto no_delete;
			ret = 0;
			wc->restarted = 0;
		}

4970 4971 4972 4973 4974 4975 4976
		/*
		 * Reloc tree doesn't contribute to qgroup numbers, and we have
		 * already accounted them at merge time (replace_path),
		 * thus we could skip expensive subtree trace here.
		 */
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
		    need_account) {
4977
			ret = btrfs_qgroup_trace_subtree(trans, next,
4978
							 generation, level - 1);
4979
			if (ret) {
4980
				btrfs_err_rl(fs_info,
J
Jeff Mahoney 已提交
4981 4982
					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
					     ret);
4983 4984
			}
		}
4985 4986 4987 4988 4989 4990 4991 4992 4993 4994

		/*
		 * We need to update the next key in our walk control so we can
		 * update the drop_progress key accordingly.  We don't care if
		 * find_next_key doesn't find a key because that means we're at
		 * the end and are going to clean up now.
		 */
		wc->drop_level = level;
		find_next_key(path, level, &wc->drop_progress);

4995 4996 4997 4998
		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
				       fs_info->nodesize, parent);
		btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
		ret = btrfs_free_extent(trans, &ref);
4999 5000
		if (ret)
			goto out_unlock;
Y
Yan, Zheng 已提交
5001
	}
5002
no_delete:
5003 5004 5005 5006
	*lookup_info = 1;
	ret = 1;

out_unlock:
Y
Yan, Zheng 已提交
5007 5008
	btrfs_tree_unlock(next);
	free_extent_buffer(next);
5009 5010

	return ret;
Y
Yan, Zheng 已提交
5011 5012
}

5013
/*
L
Liu Bo 已提交
5014
 * helper to process tree block while walking up the tree.
5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029
 *
 * when wc->stage == DROP_REFERENCE, this function drops
 * reference count on the block.
 *
 * when wc->stage == UPDATE_BACKREF, this function changes
 * wc->stage back to DROP_REFERENCE if we changed wc->stage
 * to UPDATE_BACKREF previously while processing the block.
 *
 * NOTE: return value 1 means we should stop walking up.
 */
static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 parent = 0;

	if (wc->stage == UPDATE_BACKREF) {
		BUG_ON(wc->shared_level < level);
		if (level < wc->shared_level)
			goto out;

		ret = find_next_key(path, level + 1, &wc->update_progress);
		if (ret > 0)
			wc->update_ref = 0;

		wc->stage = DROP_REFERENCE;
		wc->shared_level = -1;
		path->slots[level] = 0;

		/*
		 * check reference count again if the block isn't locked.
		 * we should start walking down the tree again if reference
		 * count is one.
		 */
		if (!path->locks[level]) {
			BUG_ON(level == 0);
			btrfs_tree_lock(eb);
			btrfs_set_lock_blocking_write(eb);
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						       eb->start, level, 1,
						       &wc->refs[level],
						       &wc->flags[level]);
			if (ret < 0) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return ret;
			}
			BUG_ON(wc->refs[level] == 0);
			if (wc->refs[level] == 1) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return 1;
			}
		}
	}

	/* wc->stage == DROP_REFERENCE */
	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);

	if (wc->refs[level] == 1) {
		if (level == 0) {
			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				ret = btrfs_dec_ref(trans, root, eb, 1);
			else
				ret = btrfs_dec_ref(trans, root, eb, 0);
			BUG_ON(ret); /* -ENOMEM */
			if (is_fstree(root->root_key.objectid)) {
				ret = btrfs_qgroup_trace_leaf_items(trans, eb);
				if (ret) {
					btrfs_err_rl(fs_info,
	"error %d accounting leaf items, quota is out of sync, rescan required",
					     ret);
				}
			}
		}
		/* make block locked assertion in btrfs_clean_tree_block happy */
		if (!path->locks[level] &&
		    btrfs_header_generation(eb) == trans->transid) {
			btrfs_tree_lock(eb);
			btrfs_set_lock_blocking_write(eb);
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
		}
		btrfs_clean_tree_block(eb);
	}

	if (eb == root->node) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = eb->start;
		else if (root->root_key.objectid != btrfs_header_owner(eb))
			goto owner_mismatch;
	} else {
		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = path->nodes[level + 1]->start;
		else if (root->root_key.objectid !=
			 btrfs_header_owner(path->nodes[level + 1]))
			goto owner_mismatch;
	}

	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
out:
	wc->refs[level] = 0;
	wc->flags[level] = 0;
	return 0;

owner_mismatch:
	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
		     btrfs_header_owner(eb), root->root_key.objectid);
	return -EUCLEAN;
}

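/*
 * walk_down_tree - descend the tree for snapshot/subtree dropping
 *
 * Repeatedly process the current node with walk_down_proc() and step into
 * the child at path->slots[level] with do_walk_down() until we hit a leaf,
 * run out of slots in the current node, or reach a subtree that does not
 * need to be descended into.
 */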
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc)
{
	int level = wc->level;
	int lookup_info = 1;
	int ret;

	while (level >= 0) {
		ret = walk_down_proc(trans, root, path, wc, lookup_info);
		if (ret > 0)
			break;

		if (level == 0)
			break;

		if (path->slots[level] >=
		    btrfs_header_nritems(path->nodes[level]))
			break;

		ret = do_walk_down(trans, root, path, wc, &lookup_info);
		if (ret > 0) {
			path->slots[level]++;
			continue;
		} else if (ret < 0)
			return ret;
		level = wc->level;
	}
	return 0;
}

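/*
 * walk_up_tree - ascend the tree after a descent, up to (not including)
 * max_level
 *
 * If the node at the current level still has unvisited slots, advance the
 * slot and return 0 so the caller can walk down again.  Otherwise finish
 * the node with walk_up_proc(), unlock and release it and go one level up.
 * Returns 1 once all levels below max_level have been processed.
 */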
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int max_level)
{
	int level = wc->level;
	int ret;

	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
	while (level < max_level && path->nodes[level]) {
		wc->level = level;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			path->slots[level]++;
			return 0;
		} else {
			ret = walk_up_proc(trans, root, path, wc);
			if (ret > 0)
				return 0;
			if (ret < 0)
				return ret;

			if (path->locks[level]) {
				btrfs_tree_unlock_rw(path->nodes[level],
						     path->locks[level]);
				path->locks[level] = 0;
			}
			free_extent_buffer(path->nodes[level]);
			path->nodes[level] = NULL;
			level++;
		}
	}
	return 1;
}

/*
 * drop a subvolume tree.
 *
 * this function traverses the tree freeing any blocks that are only
 * referenced by the tree.
 *
 * when a shared tree block is found, this function decreases its
 * reference count by one. if update_ref is true, this function
 * also makes sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 *
 * If called with for_reloc == 0, may exit early with -EAGAIN
 */
int btrfs_drop_snapshot(struct btrfs_root *root,
			 struct btrfs_block_rsv *block_rsv, int update_ref,
			 int for_reloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root_item *root_item = &root->root_item;
	struct walk_control *wc;
	struct btrfs_key key;
	int err = 0;
	int ret;
	int level;
	bool root_dropped = false;

	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		err = -ENOMEM;
		goto out;
	}

	trans = btrfs_start_transaction(tree_root, 0);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}

	err = btrfs_run_delayed_items(trans);
	if (err)
		goto out_end_trans;

	if (block_rsv)
		trans->block_rsv = block_rsv;

	/*
	 * This will help us catch people modifying the fs tree while we're
	 * dropping it.  It is unsafe to mess with the fs tree while it's being
	 * dropped as we unlock the root node and parent nodes as we walk down
	 * the tree, assuming nothing will change.  If something does change
	 * then we'll have stale information and drop references to blocks we've
	 * already dropped.
	 */
	set_bit(BTRFS_ROOT_DELETING, &root->state);
	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_header_level(root->node);
		path->nodes[level] = btrfs_lock_root_node(root);
		btrfs_set_lock_blocking_write(path->nodes[level]);
		path->slots[level] = 0;
		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
		memset(&wc->update_progress, 0,
		       sizeof(wc->update_progress));
	} else {
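		/* Resume an interrupted drop from the saved drop_progress key. */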
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
		memcpy(&wc->update_progress, &key,
		       sizeof(wc->update_progress));

		level = root_item->drop_level;
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			err = ret;
			goto out_end_trans;
		}
		WARN_ON(ret > 0);

		/*
		 * unlock our path, this is safe because only this
		 * function is allowed to delete this snapshot
		 */
		btrfs_unlock_up_safe(path, 0);

		level = btrfs_header_level(root->node);
		while (1) {
			btrfs_tree_lock(path->nodes[level]);
			btrfs_set_lock_blocking_write(path->nodes[level]);
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						path->nodes[level]->start,
						level, 1, &wc->refs[level],
						&wc->flags[level]);
			if (ret < 0) {
				err = ret;
				goto out_end_trans;
			}
			BUG_ON(wc->refs[level] == 0);

			if (level == root_item->drop_level)
				break;

			btrfs_tree_unlock(path->nodes[level]);
			path->locks[level] = 0;
			WARN_ON(wc->refs[level] != 1);
			level--;
		}
	}

	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = update_ref;
	wc->keep_locks = 0;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
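
	/*
	 * Walk the tree one round at a time: descend with walk_down_tree(),
	 * then come back up with walk_up_tree().  Between rounds the current
	 * position is saved in drop_progress so the drop can be resumed after
	 * a transaction end or a later mount.
	 */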
	while (1) {

		ret = walk_down_tree(trans, root, path, wc);
		if (ret < 0) {
			err = ret;
			break;
		}

		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
		if (ret < 0) {
			err = ret;
			break;
		}

		if (ret > 0) {
			BUG_ON(wc->stage != DROP_REFERENCE);
			break;
		}

		if (wc->stage == DROP_REFERENCE) {
			wc->drop_level = wc->level;
			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
					      &wc->drop_progress,
					      path->slots[wc->drop_level]);
		}
		btrfs_cpu_key_to_disk(&root_item->drop_progress,
				      &wc->drop_progress);
		root_item->drop_level = wc->drop_level;

		BUG_ON(wc->level == 0);
		if (btrfs_should_end_transaction(trans) ||
		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
			ret = btrfs_update_root(trans, tree_root,
						&root->root_key,
						root_item);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				err = ret;
				goto out_end_trans;
			}

			btrfs_end_transaction_throttle(trans);
			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
				btrfs_debug(fs_info,
					    "drop snapshot early exit");
				err = -EAGAIN;
				goto out_free;
			}

			trans = btrfs_start_transaction(tree_root, 0);
			if (IS_ERR(trans)) {
				err = PTR_ERR(trans);
				goto out_free;
			}
			if (block_rsv)
				trans->block_rsv = block_rsv;
		}
	}
	btrfs_release_path(path);
	if (err)
		goto out_end_trans;

	ret = btrfs_del_root(trans, &root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		err = ret;
		goto out_end_trans;
	}

	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_find_root(tree_root, &root->root_key, path,
				      NULL, NULL);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			err = ret;
			goto out_end_trans;
		} else if (ret > 0) {
			/* if we fail to delete the orphan item this time
			 * around, it'll get picked up the next time.
			 *
			 * The most common failure here is just -ENOENT.
			 */
			btrfs_del_orphan_item(trans, tree_root,
					      root->root_key.objectid);
		}
	}

	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
		btrfs_add_dropped_root(trans, root);
	} else {
		free_extent_buffer(root->node);
		free_extent_buffer(root->commit_root);
		btrfs_put_fs_root(root);
	}
	root_dropped = true;
out_end_trans:
	btrfs_end_transaction_throttle(trans);
out_free:
	kfree(wc);
	btrfs_free_path(path);
out:
	/*
	 * So if we need to stop dropping the snapshot for whatever reason we
	 * need to make sure to add it back to the dead root list so that we
	 * keep trying to do the work later.  This also cleans up roots if we
	 * don't have it in the radix (like when we recover after a power fail
	 * or unmount) so we don't leak memory.
	 */
	if (!for_reloc && !root_dropped)
		btrfs_add_dead_root(root);
	if (err && err != -EAGAIN)
		btrfs_handle_fs_error(fs_info, err, NULL);
	return err;
}

/*
 * drop subtree rooted at tree block 'node'.
 *
 * NOTE: this function will unlock and release tree block 'node'.
 * Only used by relocation code.
 */
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct extent_buffer *node,
			struct extent_buffer *parent)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct walk_control *wc;
	int level;
	int parent_level;
	int ret = 0;
	int wret;

	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

	btrfs_assert_tree_locked(parent);
	parent_level = btrfs_header_level(parent);
	atomic_inc(&parent->refs);
	path->nodes[parent_level] = parent;
	path->slots[parent_level] = btrfs_header_nritems(parent);

	btrfs_assert_tree_locked(node);
	level = btrfs_header_level(node);
	path->nodes[level] = node;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
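
	/*
	 * Treat the parent as a block with a full backref and a single
	 * reference so that blocks below 'node' are dropped against the
	 * parent's bytenr.  walk_up_tree() is bounded by parent_level, so
	 * the parent itself is never processed or freed here.
	 */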
	wc->refs[parent_level] = 1;
	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = 0;
	wc->keep_locks = 1;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {
		wret = walk_down_tree(trans, root, path, wc);
		if (wret < 0) {
			ret = wret;
			break;
		}

		wret = walk_up_tree(trans, root, path, wc, parent_level);
		if (wret < 0)
			ret = wret;
		if (wret != 0)
			break;
	}

	kfree(wc);
	btrfs_free_path(path);
	return ret;
}

/*
 * helper to account the unused space of all the readonly block groups in the
 * space_info. takes mirrors into account.
 */
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
	struct btrfs_block_group *block_group;
	u64 free_bytes = 0;
	int factor;

	/* It's df, we don't care if it's racy */
	if (list_empty(&sinfo->ro_bgs))
		return 0;

	spin_lock(&sinfo->lock);
	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
		spin_lock(&block_group->lock);

		if (!block_group->ro) {
			spin_unlock(&block_group->lock);
			continue;
		}

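		/* Scale by the redundancy factor (e.g. 2 for RAID1/DUP). */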
		factor = btrfs_bg_type_to_factor(block_group->flags);
		free_bytes += (block_group->length -
			       block_group->used) * factor;

		spin_unlock(&block_group->lock);
	}
	spin_unlock(&sinfo->lock);

	return free_bytes;
}

int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
				   u64 start, u64 end)
{
	return unpin_extent_range(fs_info, start, end, false);
}

/*
 * It used to be that old block groups would be left around forever.
 * Iterating over them would be enough to trim unused space.  Since we
 * now automatically remove them, we also need to iterate over unallocated
 * space.
 *
 * We don't want a transaction for this since the discard may take a
 * substantial amount of time.  We don't require that a transaction be
 * running, but we do need to take a running transaction into account
 * to ensure that we're not discarding chunks that were released or
 * allocated in the current transaction.
 *
 * Holding the chunks lock will prevent other threads from allocating
 * or releasing chunks, but it won't prevent a running transaction
 * from committing and releasing the memory that the pending chunks
 * list head uses.  For that, we need to take a reference to the
 * transaction and hold the commit root sem.  We only need to hold
 * it while performing the free space search since we have already
 * held back allocations.
 */
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
	u64 start = SZ_1M, len = 0, end = 0;
	int ret;

	*trimmed = 0;

	/* Discard not supported = nothing to do. */
	if (!blk_queue_discard(bdev_get_queue(device->bdev)))
		return 0;

	/* Not writable = nothing to do. */
	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return 0;

	/* No free space = nothing to do. */
	if (device->total_bytes <= device->bytes_used)
		return 0;

	ret = 0;

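	/*
	 * Iterate over the ranges of the device that are neither allocated
	 * to a chunk nor already trimmed, and discard them one by one.
	 */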
	while (1) {
		struct btrfs_fs_info *fs_info = device->fs_info;
		u64 bytes;

		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
		if (ret)
			break;

		find_first_clear_extent_bit(&device->alloc_state, start,
					    &start, &end,
					    CHUNK_TRIMMED | CHUNK_ALLOCATED);

		/* Ensure we skip the reserved area in the first 1M */
		start = max_t(u64, start, SZ_1M);

		/*
		 * If find_first_clear_extent_bit finds a range that spans the
		 * end of the device it will set end to -1, in this case it's up
		 * to the caller to trim the value to the size of the device.
		 */
		end = min(end, device->total_bytes - 1);

		len = end - start + 1;

		/* We didn't find any extents */
		if (!len) {
			mutex_unlock(&fs_info->chunk_mutex);
			ret = 0;
			break;
		}

		ret = btrfs_issue_discard(device->bdev, start, len,
					  &bytes);
		if (!ret)
			set_extent_bits(&device->alloc_state, start,
					start + bytes - 1,
					CHUNK_TRIMMED);
		mutex_unlock(&fs_info->chunk_mutex);

		if (ret)
			break;

		start += len;
		*trimmed += bytes;

		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	return ret;
}

/*
 * Trim the whole filesystem by:
 * 1) trimming the free space in each block group
 * 2) trimming the unallocated space on each device
 *
 * This will also continue trimming even if a block group or device encounters
 * an error.  The return value will be the last error, or 0 if nothing bad
 * happens.
 */
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
	struct btrfs_block_group *cache = NULL;
	struct btrfs_device *device;
	struct list_head *devices;
	u64 group_trimmed;
	u64 range_end = U64_MAX;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 bg_failed = 0;
	u64 dev_failed = 0;
	int bg_ret = 0;
	int dev_ret = 0;
	int ret = 0;

	/*
	 * Check range overflow if range->len is set.
	 * The default range->len is U64_MAX.
	 */
	if (range->len != U64_MAX &&
	    check_add_overflow(range->start, range->len, &range_end))
		return -EINVAL;

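	/* 1) Trim the free space of each block group overlapping the range. */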
	cache = btrfs_lookup_first_block_group(fs_info, range->start);
	for (; cache; cache = btrfs_next_block_group(cache)) {
		if (cache->start >= range_end) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->start);
		end = min(range_end, cache->start + cache->length);

		if (end - start >= range->minlen) {
			if (!btrfs_block_group_done(cache)) {
				ret = btrfs_cache_block_group(cache, 0);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
				ret = btrfs_wait_block_group_cache_done(cache);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				bg_failed++;
				bg_ret = ret;
				continue;
			}
		}
	}

	if (bg_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu block group(s), last error %d",
			bg_failed, bg_ret);
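
	/* 2) Trim the unallocated space on each device. */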
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	devices = &fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		ret = btrfs_trim_free_extents(device, &group_trimmed);
		if (ret) {
			dev_failed++;
			dev_ret = ret;
			break;
		}

		trimmed += group_trimmed;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	if (dev_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu device(s), last error %d",
			dev_failed, dev_ret);
	range->len = trimmed;
	if (bg_ret)
		return bg_ret;
	return dev_ret;
}

/*
 * btrfs_{start,end}_write_no_snapshotting() are similar to
 * mnt_{want,drop}_write(): they are used to prevent some tasks from writing
 * data into the page cache through nocow before the subvolume is snapshotted
 * but flushing it to disk only after the snapshot creation, or to prevent
 * operations while snapshotting is ongoing that would cause the snapshot to
 * be inconsistent (writes followed by expanding truncates for example).
 */
void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
{
	percpu_counter_dec(&root->subv_writers->counter);
	cond_wake_up(&root->subv_writers->wait);
}

int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
{
	if (atomic_read(&root->will_be_snapshotted))
		return 0;

	percpu_counter_inc(&root->subv_writers->counter);
	/*
	 * Make sure counter is updated before we check for snapshot creation.
	 */
	smp_mb();
	if (atomic_read(&root->will_be_snapshotted)) {
		btrfs_end_write_no_snapshotting(root);
		return 0;
	}
	return 1;
}

void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
{
	while (true) {
		int ret;

		ret = btrfs_start_write_no_snapshotting(root);
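		/* Nonzero: no pending snapshot, the writer count is held. */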
		if (ret)
			break;
		wait_var_event(&root->will_be_snapshotted,
			       !atomic_read(&root->will_be_snapshotted));
	}
}