// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "misc.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
#include "delalloc-space.h"
#include "block-group.h"

#undef SCRAMBLE_DELAYED_REFS


static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);

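/*
 * Return true if the block group's flags contain every bit in @bits,
 * e.g. the DATA/METADATA/SYSTEM type bits or a RAID profile bit.
 */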
static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
			      u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE);
	set_extent_bits(&fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE);
	return 0;
}

void btrfs_free_excluded_extents(struct btrfs_block_group_cache *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 start, end;

	start = cache->key.objectid;
	end = start + cache->key.offset - 1;

	clear_extent_bits(&fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE);
	clear_extent_bits(&fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE);
}

static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
{
	if (ref->type == BTRFS_REF_METADATA) {
		if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
			return BTRFS_BLOCK_GROUP_SYSTEM;
		else
			return BTRFS_BLOCK_GROUP_METADATA;
	}
	return BTRFS_BLOCK_GROUP_DATA;
}

static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
			     struct btrfs_ref *ref)
{
	struct btrfs_space_info *space_info;
	u64 flags = generic_ref_to_space_flags(ref);

	space_info = btrfs_find_space_info(fs_info, flags);
	ASSERT(space_info);
	percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
		    BTRFS_TOTAL_BYTES_PINNED_BATCH);
}

static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
			     struct btrfs_ref *ref)
{
	struct btrfs_space_info *space_info;
	u64 flags = generic_ref_to_space_flags(ref);

	space_info = btrfs_find_space_info(fs_info, flags);
	ASSERT(space_info);
	percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
		    BTRFS_TOTAL_BYTES_PINNED_BATCH);
}

/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * to see what the reference count and extent flags would be if all of
 * the delayed refs are not processed.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
		offset = fs_info->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
			ret = -EINVAL;
			btrfs_print_v0_err(fs_info);
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);

			goto out_free;
		}

		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			refcount_inc(&head->refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and try
			 * again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref_head(head);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs is optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. The full back refs is for pointers in tree blocks not
 * referenced by their owner trees. The location of tree block is recorded
 * in the back refs. Actually the full back refs is generic, and can be
 * used in all cases the implicit back refs is used. The major shortcoming
 * of the full back refs is its overhead. Every time a tree block gets
 * COWed, we have to update back refs entry for all pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs is used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointers in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs is used for
 * pointers in the block. Add full back refs for every pointers in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are entailed to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf
 *
 * When a file extent is allocated, the implicit back refs are used and
 * the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed by file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back refs is
 * the objectid of the block's owner tree. The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs are used, information about the lowest key and
 * level of the tree block is required. This information is stored in
 * the tree block info structure.
 */

/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				     struct btrfs_extent_inline_ref *iref,
				     enum btrfs_inline_ref_type is_data)
{
	int type = btrfs_extent_inline_ref_type(eb, iref);
	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_DATA_REF_KEY ||
	    type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (is_data == BTRFS_REF_TYPE_BLOCK) {
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree
				 * block, which must be aligned to
				 * nodesize.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->nodesize))
					return type;
			}
		} else if (is_data == BTRFS_REF_TYPE_DATA) {
			if (type == BTRFS_EXTENT_DATA_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_DATA_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree
				 * block, which must be aligned to
				 * nodesize.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->nodesize))
					return type;
			}
		} else {
			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
			return type;
		}
	}

	btrfs_print_leaf((struct extent_buffer *)eb);
	btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
		  eb->start, type);
	WARN_ON(1);

	return BTRFS_REF_TYPE_INVALID;
}

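/*
 * Hash the (root objectid, inode objectid, file offset) triple that
 * identifies an implicit data back ref.  The result is used as the key
 * offset of EXTENT_DATA_REF items.
 */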
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}

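/*
 * Insert a keyed data back ref item, or bump the count of an existing
 * SHARED_DATA_REF / EXTENT_DATA_REF item that matches this reference.
 */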
static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}

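/*
 * Drop @refs_to_drop references from a keyed data back ref item, deleting
 * the item when the count reaches zero and reporting that via *last_ref.
 */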
static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   int refs_to_drop, int *last_ref)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
		btrfs_print_v0_err(trans->fs_info);
		btrfs_abort_transaction(trans, -EINVAL);
		return -EINVAL;
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
		*last_ref = 1;
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;
	int type;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
	if (iref) {
		/*
		 * If type is invalid, we should have bailed out earlier than
		 * this call.
		 */
		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
		ASSERT(type != BTRFS_REF_TYPE_INVALID);
		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

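/*
 * Look up a keyed tree block back ref: SHARED_BLOCK_REF keyed on the parent
 * block if one is given, otherwise TREE_BLOCK_REF keyed on the owning root.
 * Returns -ENOENT when the item does not exist.
 */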
static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	return ret;
}

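/*
 * Insert an empty SHARED_BLOCK_REF or TREE_BLOCK_REF item for a tree block;
 * the key itself carries all of the back ref information.
 */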
static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
				      path, &key, 0);
	btrfs_release_path(path);
	return ret;
}

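/*
 * Pick the back ref key type for a new reference: tree blocks use
 * TREE_BLOCK_REF/SHARED_BLOCK_REF, data extents use EXTENT_DATA_REF/
 * SHARED_DATA_REF, with the shared variants chosen when a parent is given.
 */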
static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)
{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
825
	struct btrfs_fs_info *fs_info = trans->fs_info;
826
	struct btrfs_root *root = fs_info->extent_root;
827 828 829 830 831 832 833 834 835 836 837 838 839
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;
840
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
841
	int needed;
842

843
	key.objectid = bytenr;
Z
Zheng Yan 已提交
844
	key.type = BTRFS_EXTENT_ITEM_KEY;
845
	key.offset = num_bytes;
Z
Zheng Yan 已提交
846

847 848 849
	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
850
		path->keep_locks = 1;
851 852
	} else
		extra_size = -1;
853 854

	/*
855 856
	 * Owner is our level, so we can just add one to get the level for the
	 * block we are interested in.
857 858 859 860 861 862 863
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
864
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
865
	if (ret < 0) {
866 867 868
		err = ret;
		goto out;
	}
869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		skinny_metadata = false;
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
886
			key.objectid = bytenr;
887 888 889 890 891 892 893
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

894 895 896
	if (ret && !insert) {
		err = -ENOENT;
		goto out;
897
	} else if (WARN_ON(ret)) {
898 899
		err = -EIO;
		goto out;
900
	}
901 902 903

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
904
	if (unlikely(item_size < sizeof(*ei))) {
905 906 907 908 909
		err = -EINVAL;
		btrfs_print_v0_err(fs_info);
		btrfs_abort_transaction(trans, err);
		goto out;
	}
910 911 912 913 914 915 916

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

917
	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
918 919 920 921
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

922 923 924 925 926
	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
		needed = BTRFS_REF_TYPE_DATA;
	else
		needed = BTRFS_REF_TYPE_BLOCK;

927 928 929 930 931 932 933
	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
934 935
		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
		if (type == BTRFS_REF_TYPE_INVALID) {
936
			err = -EUCLEAN;
937 938 939
			goto out;
		}

940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990
		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block
		 */
991 992
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
993
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
994 995 996 997 998 999
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
1000
	if (insert) {
1001 1002 1003 1004 1005 1006 1007 1008 1009 1010
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

/*
 * helper to add new inline back ref
 */
static noinline_for_stack
1011
void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1012 1013 1014 1015 1016
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

1034
	btrfs_extend_item(path, size);
1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

1079 1080 1081
	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 0);
1082
	if (ret != -ENOENT)
1083
		return ret;
1084

1085
	btrfs_release_path(path);
1086 1087 1088
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1089 1090
		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
1091
	} else {
1092 1093
		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset);
1094
	}
1095 1096
	return ret;
}
Z
Zheng Yan 已提交
1097

1098 1099 1100 1101
/*
 * helper to update/remove inline back ref
 */
static noinline_for_stack
1102
void update_inline_extent_backref(struct btrfs_path *path,
1103 1104
				  struct btrfs_extent_inline_ref *iref,
				  int refs_to_mod,
J
Josef Bacik 已提交
1105 1106
				  struct btrfs_delayed_extent_op *extent_op,
				  int *last_ref)
1107
{
1108
	struct extent_buffer *leaf = path->nodes[0];
1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	u64 refs;

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

1127 1128 1129 1130 1131 1132
	/*
	 * If type is invalid, we should have bailed out after
	 * lookup_inline_extent_backref().
	 */
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
	ASSERT(type != BTRFS_REF_TYPE_INVALID);
1133 1134 1135 1136 1137 1138 1139 1140 1141 1142

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
1143
	}
Z
Zheng Yan 已提交
1144

1145 1146 1147 1148 1149 1150 1151 1152 1153
	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
J
Josef Bacik 已提交
1154
		*last_ref = 1;
1155 1156 1157 1158 1159 1160 1161 1162
		size =  btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
1163
		btrfs_truncate_item(path, item_size, 1);
1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178
	}
	btrfs_mark_buffer_dirty(leaf);
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

1179 1180 1181
	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 1);
1182 1183
	if (ret == 0) {
		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1184 1185
		update_inline_extent_backref(path, iref, refs_to_add,
					     extent_op, NULL);
1186
	} else if (ret == -ENOENT) {
1187
		setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1188 1189 1190
					    root_objectid, owner, offset,
					    refs_to_add, extent_op);
		ret = 0;
1191
	}
1192 1193
	return ret;
}

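/*
 * Add a keyed (non-inline) back reference item, dispatching on whether the
 * extent is a tree block or a data extent.
 */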
static int insert_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 u64 bytenr, u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add)
{
	int ret;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		BUG_ON(refs_to_add != 1);
1203 1204
		ret = insert_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
1205
	} else {
1206 1207 1208
		ret = insert_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset,
					     refs_to_add);
1209 1210 1211
	}
	return ret;
}

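/*
 * Drop back references: shrink or update the inline ref when one was found,
 * otherwise update/delete the keyed data ref item, or delete the keyed tree
 * block ref item outright.
 */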
static int remove_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
J
Josef Bacik 已提交
1216
				 int refs_to_drop, int is_data, int *last_ref)
1217
{
1218
	int ret = 0;
1219

1220 1221
	BUG_ON(!is_data && refs_to_drop != 1);
	if (iref) {
1222 1223
		update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
					     last_ref);
1224
	} else if (is_data) {
1225
		ret = remove_extent_data_ref(trans, path, refs_to_drop,
J
Josef Bacik 已提交
1226
					     last_ref);
1227
	} else {
J
Josef Bacik 已提交
1228
		*last_ref = 1;
1229
		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1230 1231 1232 1233
	}
	return ret;
}

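/*
 * Issue a discard for [start, start + len), skipping any superblock copies
 * that fall inside the range and returning the number of bytes that were
 * actually discarded.
 */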
static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
			       u64 *discarded_bytes)
1236
{
1237 1238
	int j, ret = 0;
	u64 bytes_left, end;
1239
	u64 aligned_start = ALIGN(start, 1 << 9);
1240

1241 1242 1243 1244 1245
	if (WARN_ON(start != aligned_start)) {
		len -= aligned_start - start;
		len = round_down(len, 1 << 9);
		start = aligned_start;
	}
1246

1247
	*discarded_bytes = 0;
1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298

	if (!len)
		return 0;

	end = start + len;
	bytes_left = len;

	/* Skip any superblocks on this device. */
	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
		u64 sb_start = btrfs_sb_offset(j);
		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
		u64 size = sb_start - start;

		if (!in_range(sb_start, start, bytes_left) &&
		    !in_range(sb_end, start, bytes_left) &&
		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
			continue;

		/*
		 * Superblock spans beginning of range.  Adjust start and
		 * try again.
		 */
		if (sb_start <= start) {
			start += sb_end - start;
			if (start > end) {
				bytes_left = 0;
				break;
			}
			bytes_left = end - start;
			continue;
		}

		if (size) {
			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
						   GFP_NOFS, 0);
			if (!ret)
				*discarded_bytes += size;
			else if (ret != -EOPNOTSUPP)
				return ret;
		}

		start = sb_end;
		if (start > end) {
			bytes_left = 0;
			break;
		}
		bytes_left = end - start;
	}

	if (bytes_left) {
		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1299 1300
					   GFP_NOFS, 0);
		if (!ret)
1301
			*discarded_bytes += bytes_left;
1302
	}
1303
	return ret;
1304 1305
}

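/*
 * Discard the given logical range on every stripe backing it, resolving the
 * physical extents with btrfs_map_block() and skipping devices whose queues
 * do not support discard.
 */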
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1307
			 u64 num_bytes, u64 *actual_bytes)
1308 1309
{
	int ret;
1310
	u64 discarded_bytes = 0;
1311
	struct btrfs_bio *bbio = NULL;
1312

C
Christoph Hellwig 已提交
1313

1314 1315 1316 1317
	/*
	 * Avoid races with device replace and make sure our bbio has devices
	 * associated to its stripes that don't go away while we are discarding.
	 */
1318
	btrfs_bio_counter_inc_blocked(fs_info);
1319
	/* Tell the block device(s) that the sectors can be discarded */
1320 1321
	ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
			      &bbio, 0);
1322
	/* Error condition is -ENOMEM */
1323
	if (!ret) {
1324
		struct btrfs_bio_stripe *stripe = bbio->stripes;
1325 1326 1327
		int i;


1328
		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1329
			u64 bytes;
1330 1331
			struct request_queue *req_q;

1332 1333 1334 1335
			if (!stripe->dev->bdev) {
				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
				continue;
			}
1336 1337
			req_q = bdev_get_queue(stripe->dev->bdev);
			if (!blk_queue_discard(req_q))
1338 1339
				continue;

1340 1341
			ret = btrfs_issue_discard(stripe->dev->bdev,
						  stripe->physical,
1342 1343
						  stripe->length,
						  &bytes);
1344
			if (!ret)
1345
				discarded_bytes += bytes;
1346
			else if (ret != -EOPNOTSUPP)
1347
				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1348 1349 1350 1351 1352 1353 1354

			/*
			 * Just in case we get back EOPNOTSUPP for some reason,
			 * just ignore the return value so we don't screw up
			 * people calling discard_extent.
			 */
			ret = 0;
1355
		}
1356
		btrfs_put_bbio(bbio);
1357
	}
1358
	btrfs_bio_counter_dec(fs_info);
1359 1360 1361 1362

	if (actual_bytes)
		*actual_bytes = discarded_bytes;

1363

D
David Woodhouse 已提交
1364 1365
	if (ret == -EOPNOTSUPP)
		ret = 0;
1366 1367 1368
	return ret;
}

1369
/* Can return -ENOMEM */
1370
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1371
			 struct btrfs_ref *generic_ref)
1372
{
1373
	struct btrfs_fs_info *fs_info = trans->fs_info;
1374
	int old_ref_mod, new_ref_mod;
1375
	int ret;
A
Arne Jansen 已提交
1376

1377 1378 1379 1380
	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
	       generic_ref->action);
	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
	       generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
1381

1382 1383
	if (generic_ref->type == BTRFS_REF_METADATA)
		ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
1384
				NULL, &old_ref_mod, &new_ref_mod);
1385 1386
	else
		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
1387 1388
						 &old_ref_mod, &new_ref_mod);

1389
	btrfs_ref_tree_mod(fs_info, generic_ref);
1390

1391
	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
1392
		sub_pinned_bytes(fs_info, generic_ref);
1393

1394 1395 1396
	return ret;
}

1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430
/*
 * __btrfs_inc_extent_ref - insert backreference for a given extent
 *
 * @trans:	    Handle of transaction
 *
 * @node:	    The delayed ref node used to get the bytenr/length for
 *		    extent whose references are incremented.
 *
 * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
 *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
 *		    bytenr of the parent block. Since new extents are always
 *		    created with indirect references, this will only be the case
 *		    when relocating a shared extent. In that case, root_objectid
 *		    will be BTRFS_TREE_RELOC_OBJECTID. Otheriwse, parent must
 *		    be 0
 *
 * @root_objectid:  The id of the root where this modification has originated,
 *		    this can be either one of the well-known metadata trees or
 *		    the subvolume id which references this extent.
 *
 * @owner:	    For data extents it is the inode number of the owning file.
 *		    For metadata extents this parameter holds the level in the
 *		    tree of the extent.
 *
 * @offset:	    For metadata extents the offset is ignored and is currently
 *		    always passed as 0. For data extents it is the fileoffset
 *		    this extent belongs to.
 *
 * @refs_to_add     Number of references to add
 *
 * @extent_op       Pointer to a structure, holding information necessary when
 *                  updating a tree block's flags
 *
 */
1431
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1432
				  struct btrfs_delayed_ref_node *node,
1433 1434 1435 1436 1437 1438 1439
				  u64 parent, u64 root_objectid,
				  u64 owner, u64 offset, int refs_to_add,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
J
Josef Bacik 已提交
1440
	struct btrfs_key key;
1441 1442
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
1443 1444 1445 1446 1447 1448 1449
	u64 refs;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

1450
	path->reada = READA_FORWARD;
1451 1452
	path->leave_spinning = 1;
	/* this will setup the path even if it fails to insert the back ref */
1453 1454 1455
	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
					   parent, root_objectid, owner,
					   offset, refs_to_add, extent_op);
1456
	if ((ret < 0 && ret != -EAGAIN) || !ret)
1457
		goto out;
J
Josef Bacik 已提交
1458 1459 1460 1461 1462 1463

	/*
	 * Ok we had -EAGAIN which means we didn't have space to insert and
	 * inline extent ref, so just update the reference count and add a
	 * normal backref.
	 */
1464
	leaf = path->nodes[0];
J
Josef Bacik 已提交
1465
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1466 1467 1468 1469 1470
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, item);
	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, item);
1471

1472
	btrfs_mark_buffer_dirty(leaf);
1473
	btrfs_release_path(path);
1474

1475
	path->reada = READA_FORWARD;
1476
	path->leave_spinning = 1;
1477
	/* now insert the actual backref */
1478 1479
	ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
				    owner, offset, refs_to_add);
1480
	if (ret)
1481
		btrfs_abort_transaction(trans, ret);
1482
out:
1483
	btrfs_free_path(path);
1484
	return ret;
1485 1486
}

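/*
 * Apply one delayed data ref: insert the reserved extent item on the first
 * reference, otherwise add or drop references according to the action.
 */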
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
1491
{
1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503
	int ret = 0;
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_key ins;
	u64 parent = 0;
	u64 ref_root = 0;
	u64 flags = 0;

	ins.objectid = node->bytenr;
	ins.offset = node->num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;

	ref = btrfs_delayed_node_to_data_ref(node);
1504
	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
1505

1506 1507
	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
		parent = ref->parent;
J
Josef Bacik 已提交
1508
	ref_root = ref->root;
1509 1510

	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1511
		if (extent_op)
1512
			flags |= extent_op->flags_to_set;
1513 1514 1515 1516
		ret = alloc_reserved_file_extent(trans, parent, ref_root,
						 flags, ref->objectid,
						 ref->offset, &ins,
						 node->ref_mod);
1517
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1518 1519 1520
		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
					     ref->objectid, ref->offset,
					     node->ref_mod, extent_op);
1521
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1522
		ret = __btrfs_free_extent(trans, node, parent,
1523 1524
					  ref_root, ref->objectid,
					  ref->offset, node->ref_mod,
1525
					  extent_op);
1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550
	} else {
		BUG();
	}
	return ret;
}

static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei)
{
	u64 flags = btrfs_extent_flags(leaf, ei);
	if (extent_op->update_flags) {
		flags |= extent_op->flags_to_set;
		btrfs_set_extent_flags(leaf, ei, flags);
	}

	if (extent_op->update_key) {
		struct btrfs_tree_block_info *bi;
		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
	}
}

static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1551
				 struct btrfs_delayed_ref_head *head,
1552 1553
				 struct btrfs_delayed_extent_op *extent_op)
{
1554
	struct btrfs_fs_info *fs_info = trans->fs_info;
1555 1556 1557 1558 1559
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	u32 item_size;
1560
	int ret;
1561
	int err = 0;
1562
	int metadata = !extent_op->is_data;
1563

1564 1565 1566
	if (trans->aborted)
		return 0;

1567
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1568 1569
		metadata = 0;

1570 1571 1572 1573
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

1574
	key.objectid = head->bytenr;
1575

1576 1577
	if (metadata) {
		key.type = BTRFS_METADATA_ITEM_KEY;
1578
		key.offset = extent_op->level;
1579 1580
	} else {
		key.type = BTRFS_EXTENT_ITEM_KEY;
1581
		key.offset = head->num_bytes;
1582 1583 1584
	}

again:
1585
	path->reada = READA_FORWARD;
1586
	path->leave_spinning = 1;
1587
	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
1588 1589 1590 1591 1592
	if (ret < 0) {
		err = ret;
		goto out;
	}
	if (ret > 0) {
1593
		if (metadata) {
1594 1595 1596 1597
			if (path->slots[0] > 0) {
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
1598
				if (key.objectid == head->bytenr &&
1599
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
1600
				    key.offset == head->num_bytes)
1601 1602 1603 1604 1605
					ret = 0;
			}
			if (ret > 0) {
				btrfs_release_path(path);
				metadata = 0;
1606

1607 1608
				key.objectid = head->bytenr;
				key.offset = head->num_bytes;
1609 1610 1611 1612 1613 1614
				key.type = BTRFS_EXTENT_ITEM_KEY;
				goto again;
			}
		} else {
			err = -EIO;
			goto out;
1615
		}
1616 1617 1618 1619
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1620

1621
	if (unlikely(item_size < sizeof(*ei))) {
1622 1623 1624 1625 1626 1627
		err = -EINVAL;
		btrfs_print_v0_err(fs_info);
		btrfs_abort_transaction(trans, err);
		goto out;
	}

1628 1629
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	__run_delayed_extent_op(extent_op, leaf, ei);
1630

1631 1632 1633 1634
	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return err;
1635 1636
}

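/*
 * Apply one delayed tree block ref.  Tree blocks always carry exactly one
 * reference per ref node, so any other ref_mod is treated as corruption.
 */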
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
1641 1642
{
	int ret = 0;
1643 1644 1645
	struct btrfs_delayed_tree_ref *ref;
	u64 parent = 0;
	u64 ref_root = 0;
1646

1647
	ref = btrfs_delayed_node_to_tree_ref(node);
1648
	trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
1649

1650 1651
	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		parent = ref->parent;
J
Josef Bacik 已提交
1652
	ref_root = ref->root;
1653

1654
	if (node->ref_mod != 1) {
1655
		btrfs_err(trans->fs_info,
1656 1657 1658 1659 1660
	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
			  node->bytenr, node->ref_mod, node->action, ref_root,
			  parent);
		return -EIO;
	}
1661
	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1662
		BUG_ON(!extent_op || !extent_op->update_flags);
1663
		ret = alloc_reserved_tree_block(trans, node, extent_op);
1664
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
1665 1666
		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
					     ref->level, 0, 1, extent_op);
1667
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
1668
		ret = __btrfs_free_extent(trans, node, parent, ref_root,
1669
					  ref->level, 0, 1, extent_op);
1670 1671 1672
	} else {
		BUG();
	}
1673 1674 1675 1676
	return ret;
}

/* helper function to actually process a single delayed ref entry */
1677 1678 1679 1680
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node,
			       struct btrfs_delayed_extent_op *extent_op,
			       int insert_reserved)
1681
{
1682 1683
	int ret = 0;

1684 1685
	if (trans->aborted) {
		if (insert_reserved)
1686
			btrfs_pin_extent(trans->fs_info, node->bytenr,
1687
					 node->num_bytes, 1);
1688
		return 0;
1689
	}
1690

1691 1692
	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1693
		ret = run_delayed_tree_ref(trans, node, extent_op,
1694 1695 1696
					   insert_reserved);
	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
		 node->type == BTRFS_SHARED_DATA_REF_KEY)
1697
		ret = run_delayed_data_ref(trans, node, extent_op,
1698 1699 1700
					   insert_reserved);
	else
		BUG();
1701 1702 1703
	if (ret && insert_reserved)
		btrfs_pin_extent(trans->fs_info, node->bytenr,
				 node->num_bytes, 1);
1704
	return ret;
1705 1706
}

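/*
 * Pick the next delayed ref to run for a head, preferring ADD refs so the
 * extent item is not deleted while references remain to be added.
 */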
static inline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
1710 1711
	struct btrfs_delayed_ref_node *ref;

1712
	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1713
		return NULL;
1714

1715 1716 1717 1718 1719 1720
	/*
	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
	 * This is to prevent a ref count from going down to zero, which deletes
	 * the extent item from the extent tree, when there still are references
	 * to add, which would fail because they would not find the extent item.
	 */
1721 1722 1723 1724
	if (!list_empty(&head->ref_add_list))
		return list_first_entry(&head->ref_add_list,
				struct btrfs_delayed_ref_node, add_list);

1725
	ref = rb_entry(rb_first_cached(&head->ref_tree),
1726
		       struct btrfs_delayed_ref_node, ref_node);
1727 1728
	ASSERT(list_empty(&ref->add_list));
	return ref;
1729 1730
}

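/*
 * Put a ref head back on the list of heads that still need processing and
 * release its mutex, used when processing has to be retried later.
 */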
static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
				      struct btrfs_delayed_ref_head *head)
{
	spin_lock(&delayed_refs->lock);
	head->processing = 0;
	delayed_refs->num_heads_ready++;
	spin_unlock(&delayed_refs->lock);
	btrfs_delayed_ref_unlock(head);
}

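/*
 * Return the extent_op attached to a ref head, if any.  When the head still
 * has to insert its reserved extent, the flags/key update happens at insert
 * time instead, so the pending extent_op is freed and NULL is returned.
 */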
static struct btrfs_delayed_extent_op *cleanup_extent_op(
				struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_extent_op *extent_op = head->extent_op;

	if (!extent_op)
		return NULL;

	if (head->must_insert_reserved) {
		head->extent_op = NULL;
		btrfs_free_delayed_extent_op(extent_op);
		return NULL;
	}
	return extent_op;
}

static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	extent_op = cleanup_extent_op(head);
	if (!extent_op)
		return 0;
	head->extent_op = NULL;
	spin_unlock(&head->lock);
	ret = run_delayed_extent_op(trans, head, extent_op);
	btrfs_free_delayed_extent_op(extent_op);
	return ret ? ret : 1;
}

void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
				  struct btrfs_delayed_ref_root *delayed_refs,
				  struct btrfs_delayed_ref_head *head)
{
	int nr_items = 1;	/* Dropping this ref head update. */

	if (head->total_ref_mod < 0) {
		struct btrfs_space_info *space_info;
		u64 flags;

		if (head->is_data)
			flags = BTRFS_BLOCK_GROUP_DATA;
		else if (head->is_system)
			flags = BTRFS_BLOCK_GROUP_SYSTEM;
		else
			flags = BTRFS_BLOCK_GROUP_METADATA;
		space_info = btrfs_find_space_info(fs_info, flags);
		ASSERT(space_info);
		percpu_counter_add_batch(&space_info->total_bytes_pinned,
				   -head->num_bytes,
				   BTRFS_TOTAL_BYTES_PINNED_BATCH);

		/*
		 * We had csum deletions accounted for in our delayed refs rsv,
		 * we need to drop the csum leaves for this update from our
		 * delayed_refs_rsv.
		 */
		if (head->is_data) {
			spin_lock(&delayed_refs->lock);
			delayed_refs->pending_csums -= head->num_bytes;
			spin_unlock(&delayed_refs->lock);
			nr_items += btrfs_csum_bytes_to_leaves(fs_info,
				head->num_bytes);
		}
	}

	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
}

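/*
 * Final processing of a ref head once all of its refs have been run: run any
 * remaining extent_op, remove the head from the rbtree, pin the extent if it
 * was never inserted, and release the accounting held for this head.
 */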
static int cleanup_ref_head(struct btrfs_trans_handle *trans,
			    struct btrfs_delayed_ref_head *head)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	ret = run_and_cleanup_extent_op(trans, head);
	if (ret < 0) {
		unselect_delayed_ref_head(delayed_refs, head);
		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
		return ret;
	} else if (ret) {
		return ret;
	}

	/*
	 * Need to drop our head ref lock and re-acquire the delayed ref lock
	 * and then re-check to make sure nobody got added.
	 */
	spin_unlock(&head->lock);
	spin_lock(&delayed_refs->lock);
	spin_lock(&head->lock);
	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
		spin_unlock(&head->lock);
		spin_unlock(&delayed_refs->lock);
		return 1;
	}
	btrfs_delete_ref_head(delayed_refs, head);
	spin_unlock(&head->lock);
	spin_unlock(&delayed_refs->lock);

	if (head->must_insert_reserved) {
		btrfs_pin_extent(fs_info, head->bytenr,
				 head->num_bytes, 1);
		if (head->is_data) {
			ret = btrfs_del_csums(trans, fs_info, head->bytenr,
					      head->num_bytes);
		}
	}

	btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);

	trace_run_delayed_ref_head(fs_info, head, 0);
	btrfs_delayed_ref_unlock(head);
	btrfs_put_delayed_ref_head(head);
	return 0;
}

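/*
 * Pick the next delayed ref head to process and take its lock.  Returns NULL
 * when there is nothing to do and ERR_PTR(-EAGAIN) if the head was freed
 * while we waited for the mutex.
 */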
static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
					struct btrfs_trans_handle *trans)
{
	struct btrfs_delayed_ref_root *delayed_refs =
		&trans->transaction->delayed_refs;
	struct btrfs_delayed_ref_head *head = NULL;
	int ret;

	spin_lock(&delayed_refs->lock);
	head = btrfs_select_ref_head(delayed_refs);
	if (!head) {
		spin_unlock(&delayed_refs->lock);
		return head;
	}

	/*
	 * Grab the lock that says we are going to process all the refs for
	 * this head
	 */
	ret = btrfs_delayed_ref_lock(delayed_refs, head);
	spin_unlock(&delayed_refs->lock);

	/*
	 * We may have dropped the spin lock to get the head mutex lock, and
	 * that might have given someone else time to free the head.  If that's
	 * true, it has been removed from our list and we can move on.
	 */
	if (ret == -EAGAIN)
		head = ERR_PTR(-EAGAIN);

	return head;
}

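/*
 * Run all of the currently queued refs of a single locked head, merging
 * add/drop pairs as we go.  *run_refs is incremented for every ref actually
 * processed.  Returns -EAGAIN if processing has to be deferred because of a
 * pending tree mod log sequence number.
 */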
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
				    struct btrfs_delayed_ref_head *locked_ref,
				    unsigned long *run_refs)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_extent_op *extent_op;
	struct btrfs_delayed_ref_node *ref;
	int must_insert_reserved = 0;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	lockdep_assert_held(&locked_ref->mutex);
	lockdep_assert_held(&locked_ref->lock);

	while ((ref = select_delayed_ref(locked_ref))) {
		if (ref->seq &&
		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
			spin_unlock(&locked_ref->lock);
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			return -EAGAIN;
		}

		(*run_refs)++;
		ref->in_tree = 0;
		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
		RB_CLEAR_NODE(&ref->ref_node);
		if (!list_empty(&ref->add_list))
			list_del(&ref->add_list);
		/*
		 * When we play the delayed ref, also correct the ref_mod on
		 * head
		 */
		switch (ref->action) {
		case BTRFS_ADD_DELAYED_REF:
		case BTRFS_ADD_DELAYED_EXTENT:
			locked_ref->ref_mod -= ref->ref_mod;
			break;
		case BTRFS_DROP_DELAYED_REF:
			locked_ref->ref_mod += ref->ref_mod;
			break;
		default:
			WARN_ON(1);
		}
		atomic_dec(&delayed_refs->num_entries);

		/*
		 * Record the must_insert_reserved flag before we drop the
		 * spin lock.
		 */
		must_insert_reserved = locked_ref->must_insert_reserved;
		locked_ref->must_insert_reserved = 0;

		extent_op = locked_ref->extent_op;
		locked_ref->extent_op = NULL;
		spin_unlock(&locked_ref->lock);

		ret = run_one_delayed_ref(trans, ref, extent_op,
					  must_insert_reserved);

		btrfs_free_delayed_extent_op(extent_op);
		if (ret) {
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			btrfs_put_delayed_ref(ref);
			btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
				    ret);
			return ret;
		}

		btrfs_put_delayed_ref(ref);
		cond_resched();

		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
	}

	return 0;
}

/*
 * Returns 0 on success or if called with an already aborted transaction.
 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
 */
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
					     unsigned long nr)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *locked_ref = NULL;
	ktime_t start = ktime_get();
	int ret;
	unsigned long count = 0;
	unsigned long actual_count = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	do {
		if (!locked_ref) {
			locked_ref = btrfs_obtain_ref_head(trans);
			if (IS_ERR_OR_NULL(locked_ref)) {
				if (PTR_ERR(locked_ref) == -EAGAIN) {
					continue;
				} else {
					break;
				}
			}
			count++;
		}
		/*
		 * We need to try and merge add/drops of the same ref since we
		 * can run into issues with relocate dropping the implicit ref
		 * and then it being added back again before the drop can
		 * finish.  If we merged anything we need to re-loop so we can
		 * get a good ref.
		 * Or we can get node references of the same type that weren't
		 * merged when created due to bumps in the tree mod seq, and
		 * we need to merge them to prevent adding an inline extent
		 * backref before dropping it (triggering a BUG_ON at
		 * insert_inline_extent_backref()).
		 */
		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);

		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
						      &actual_count);
		if (ret < 0 && ret != -EAGAIN) {
			/*
			 * Error, btrfs_run_delayed_refs_for_head already
			 * unlocked everything so just bail out
			 */
			return ret;
		} else if (!ret) {
			/*
			 * Success, perform the usual cleanup of a processed
			 * head
			 */
			ret = cleanup_ref_head(trans, locked_ref);
			if (ret > 0) {
				/* We dropped our lock, we need to loop. */
				ret = 0;
				continue;
			} else if (ret) {
				return ret;
			}
		}

		/*
		 * Either success case or btrfs_run_delayed_refs_for_head
		 * returned -EAGAIN, meaning we need to select another head
		 */

		locked_ref = NULL;
		cond_resched();
	} while ((nr != -1 && count < nr) || locked_ref);

	/*
	 * We don't want to include ref heads since we can have empty ref heads
	 * and those will drastically skew our runtime down since we just do
	 * accounting, no actual extent tree updates.
	 */
	if (actual_count > 0) {
		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
		u64 avg;

		/*
		 * We weigh the current average higher than our current runtime
		 * to avoid large swings in the average.
		 */
		spin_lock(&delayed_refs->lock);
		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
		spin_unlock(&delayed_refs->lock);
	}
	return 0;
}

#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning
 */
static u64 find_middle(struct rb_root *root)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int alt = 1;
	u64 middle;
	u64 first = 0, last = 0;

	n = rb_first(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		first = entry->bytenr;
	}
	n = rb_last(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		last = entry->bytenr;
	}
	n = root->rb_node;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		middle = entry->bytenr;

		if (alt)
			n = n->rb_left;
		else
			n = n->rb_right;

		alt = 1 - alt;
	}
	return middle;
}
#endif

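/*
 * Rough estimate of how many extent tree leaves would be dirtied by updating
 * the given number of delayed ref heads.
 */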
static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
{
	u64 num_bytes;

	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
			     sizeof(struct btrfs_extent_inline_ref));
	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		num_bytes += heads * sizeof(struct btrfs_tree_block_info);

	/*
	 * We don't ever fill up leaves all the way so multiply by 2 just to be
	 * closer to what we're really going to want to use.
	 */
	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
}

/*
 * Takes the number of bytes to be csumm'ed and figures out how many leaves it
 * would require to store the csums for that many bytes.
 */
u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
{
	u64 csum_size;
	u64 num_csums_per_leaf;
	u64 num_csums;

	csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
	num_csums_per_leaf = div64_u64(csum_size,
			(u64)btrfs_super_csum_size(fs_info->super_copy));
	num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
	num_csums += num_csums_per_leaf - 1;
	num_csums = div64_u64(num_csums, num_csums_per_leaf);
	return num_csums;
}

/*
 * this starts processing the delayed reference count updates and
 * extent insertions we have queued up so far.  count can be
 * 0, which means to process everything in the tree at the start
 * of the run (but not newly added entries), or it can be some target
 * number you'd like to process.
 *
 * Returns 0 on success or if called with an aborted transaction
 * Returns <0 on error and aborts the transaction
 */
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
			   unsigned long count)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct rb_node *node;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *head;
	int ret;
	int run_all = count == (unsigned long)-1;

	/* We'll clean this up in btrfs_cleanup_transaction */
	if (trans->aborted)
		return 0;

	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
		return 0;

	delayed_refs = &trans->transaction->delayed_refs;
	if (count == 0)
		count = atomic_read(&delayed_refs->num_entries) * 2;

again:
#ifdef SCRAMBLE_DELAYED_REFS
	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
	ret = __btrfs_run_delayed_refs(trans, count);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (run_all) {
		btrfs_create_pending_block_groups(trans);

		spin_lock(&delayed_refs->lock);
		node = rb_first_cached(&delayed_refs->href_root);
		if (!node) {
			spin_unlock(&delayed_refs->lock);
			goto out;
		}
		head = rb_entry(node, struct btrfs_delayed_ref_head,
				href_node);
		refcount_inc(&head->refs);
		spin_unlock(&delayed_refs->lock);

		/* Mutex was contended, block until it's released and retry. */
		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);

		btrfs_put_delayed_ref_head(head);
		cond_resched();
		goto again;
	}
out:
	return 0;
}

int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
				u64 bytenr, u64 num_bytes, u64 flags,
				int level, int is_data)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	extent_op = btrfs_alloc_delayed_extent_op();
	if (!extent_op)
		return -ENOMEM;

	extent_op->flags_to_set = flags;
	extent_op->update_flags = true;
	extent_op->update_key = false;
	extent_op->is_data = is_data ? true : false;
	extent_op->level = level;

	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
	if (ret)
		btrfs_free_delayed_extent_op(extent_op);
	return ret;
}

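/*
 * Look through the delayed refs queued for @bytenr in the running transaction
 * and return 1 if any of them belongs to another root/inode/offset, i.e. the
 * extent is (or is about to become) shared.  Returns -EAGAIN if the ref head
 * mutex was contended and the caller must retry.
 */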
static noinline int check_delayed_ref(struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid, u64 offset, u64 bytenr)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_data_ref *data_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_transaction *cur_trans;
	struct rb_node *node;
	int ret = 0;

	spin_lock(&root->fs_info->trans_lock);
	cur_trans = root->fs_info->running_transaction;
	if (cur_trans)
		refcount_inc(&cur_trans->use_count);
	spin_unlock(&root->fs_info->trans_lock);
	if (!cur_trans)
		return 0;

	delayed_refs = &cur_trans->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (!head) {
		spin_unlock(&delayed_refs->lock);
		btrfs_put_transaction(cur_trans);
		return 0;
	}

	if (!mutex_trylock(&head->mutex)) {
		refcount_inc(&head->refs);
		spin_unlock(&delayed_refs->lock);

		btrfs_release_path(path);

		/*
		 * Mutex was contended, block until it's released and let
		 * caller try again
		 */
		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref_head(head);
		btrfs_put_transaction(cur_trans);
		return -EAGAIN;
	}
	spin_unlock(&delayed_refs->lock);

	spin_lock(&head->lock);
	/*
	 * XXX: We should replace this with a proper search function in the
	 * future.
	 */
	for (node = rb_first_cached(&head->ref_tree); node;
	     node = rb_next(node)) {
		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
		/* If it's a shared ref we know a cross reference exists */
		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
			ret = 1;
			break;
		}

		data_ref = btrfs_delayed_node_to_data_ref(ref);

		/*
		 * If our ref doesn't match the one we're currently looking at
		 * then we have a cross reference.
		 */
		if (data_ref->root != root->root_key.objectid ||
		    data_ref->objectid != objectid ||
		    data_ref->offset != offset) {
			ret = 1;
			break;
		}
	}
	spin_unlock(&head->lock);
	mutex_unlock(&head->mutex);
	btrfs_put_transaction(cur_trans);
	return ret;
}

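/*
 * Check the committed extent tree for cross references on a data extent.
 * Returns 0 when the extent is only referenced once by this root/inode/offset,
 * 1 when it is (or may be) shared, and -ENOENT when no extent item was found.
 */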
static noinline int check_committed_ref(struct btrfs_root *root,
					struct btrfs_path *path,
					u64 objectid, u64 offset, u64 bytenr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_extent_item *ei;
	struct btrfs_key key;
	u32 item_size;
	int type;
	int ret;

	key.objectid = bytenr;
	key.offset = (u64)-1;
	key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	BUG_ON(ret == 0); /* Corruption */

	ret = -ENOENT;
	if (path->slots[0] == 0)
		goto out;

	path->slots[0]--;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
		goto out;

	ret = 1;
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);

	/* If extent item has more than 1 inline ref then it's shared */
	if (item_size != sizeof(*ei) +
	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
		goto out;

	/* If extent created before last snapshot => it's definitely shared */
	if (btrfs_extent_generation(leaf, ei) <=
	    btrfs_root_last_snapshot(&root->root_item))
		goto out;

	iref = (struct btrfs_extent_inline_ref *)(ei + 1);

	/* If this extent has SHARED_DATA_REF then it's shared */
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
	if (type != BTRFS_EXTENT_DATA_REF_KEY)
		goto out;

	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
	if (btrfs_extent_refs(leaf, ei) !=
	    btrfs_extent_data_ref_count(leaf, ref) ||
	    btrfs_extent_data_ref_root(leaf, ref) !=
	    root->root_key.objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		goto out;

	ret = 0;
out:
	return ret;
}

int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
			  u64 bytenr)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	do {
		ret = check_committed_ref(root, path, objectid,
					  offset, bytenr);
		if (ret && ret != -ENOENT)
			goto out;

		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
	} while (ret == -EAGAIN);

out:
	btrfs_free_path(path);
	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
		WARN_ON(ret > 0);
	return ret;
}

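/*
 * Walk every pointer in a buffer (file extents in a leaf, child blocks in a
 * node) and add or drop one reference for each of them, depending on @inc.
 */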
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct extent_buffer *buf,
			   int full_backref, int inc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 num_bytes;
	u64 parent;
	u64 ref_root;
	u32 nritems;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	struct btrfs_ref generic_ref = { 0 };
	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
	int i;
	int action;
	int level;
	int ret = 0;

	if (btrfs_is_testing(fs_info))
		return 0;

	ref_root = btrfs_header_owner(buf);
	nritems = btrfs_header_nritems(buf);
	level = btrfs_header_level(buf);

	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
		return 0;

	if (full_backref)
		parent = buf->start;
	else
		parent = 0;
	if (inc)
		action = BTRFS_ADD_DELAYED_REF;
	else
		action = BTRFS_DROP_DELAYED_REF;

	for (i = 0; i < nritems; i++) {
		if (level == 0) {
			btrfs_item_key_to_cpu(buf, &key, i);
			if (key.type != BTRFS_EXTENT_DATA_KEY)
				continue;
			fi = btrfs_item_ptr(buf, i,
					    struct btrfs_file_extent_item);
			if (btrfs_file_extent_type(buf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE)
				continue;
			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
			if (bytenr == 0)
				continue;

			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
			key.offset -= btrfs_file_extent_offset(buf, fi);
			btrfs_init_generic_ref(&generic_ref, action, bytenr,
					       num_bytes, parent);
			generic_ref.real_root = root->root_key.objectid;
			btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
					    key.offset);
			generic_ref.skip_qgroup = for_reloc;
			if (inc)
				ret = btrfs_inc_extent_ref(trans, &generic_ref);
			else
				ret = btrfs_free_extent(trans, &generic_ref);
			if (ret)
				goto fail;
		} else {
			bytenr = btrfs_node_blockptr(buf, i);
			num_bytes = fs_info->nodesize;
			btrfs_init_generic_ref(&generic_ref, action, bytenr,
					       num_bytes, parent);
			generic_ref.real_root = root->root_key.objectid;
			btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
			generic_ref.skip_qgroup = for_reloc;
			if (inc)
				ret = btrfs_inc_extent_ref(trans, &generic_ref);
			else
				ret = btrfs_free_extent(trans, &generic_ref);
			if (ret)
				goto fail;
		}
	}
	return 0;
fail:
	return ret;
}

int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
}

int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}

int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_block_group_cache *block_group;
	int readonly = 0;

	block_group = btrfs_lookup_block_group(fs_info, bytenr);
	if (!block_group || block_group->ro)
		readonly = 1;
	if (block_group)
		btrfs_put_block_group(block_group);
	return readonly;
}

static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 flags;
	u64 ret;

	if (data)
		flags = BTRFS_BLOCK_GROUP_DATA;
	else if (root == fs_info->chunk_root)
		flags = BTRFS_BLOCK_GROUP_SYSTEM;
	else
		flags = BTRFS_BLOCK_GROUP_METADATA;

	ret = btrfs_get_alloc_profile(fs_info, flags);
	return ret;
}

static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
{
	struct btrfs_block_group_cache *cache;
	u64 bytenr;

	spin_lock(&fs_info->block_group_cache_lock);
	bytenr = fs_info->first_logical_byte;
	spin_unlock(&fs_info->block_group_cache_lock);

	if (bytenr < (u64)-1)
		return bytenr;

	cache = btrfs_lookup_first_block_group(fs_info, search_start);
	if (!cache)
		return 0;

	bytenr = cache->key.objectid;
	btrfs_put_block_group(cache);

	return bytenr;
}

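/*
 * Account an extent as pinned in its block group and space_info and mark the
 * range dirty in the pinned extents tree so it is unpinned at transaction
 * commit.
 */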
static int pin_down_extent(struct btrfs_block_group_cache *cache,
			   u64 bytenr, u64 num_bytes, int reserved)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;

	spin_lock(&cache->space_info->lock);
	spin_lock(&cache->lock);
	cache->pinned += num_bytes;
	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
					     num_bytes);
	if (reserved) {
		cache->reserved -= num_bytes;
		cache->space_info->bytes_reserved -= num_bytes;
	}
	spin_unlock(&cache->lock);
	spin_unlock(&cache->space_info->lock);

	percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
		    num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
	set_extent_dirty(fs_info->pinned_extents, bytenr,
			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
	return 0;
}

/*
 * this function must be called within transaction
 */
int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
		     u64 bytenr, u64 num_bytes, int reserved)
{
	struct btrfs_block_group_cache *cache;

	cache = btrfs_lookup_block_group(fs_info, bytenr);
	BUG_ON(!cache); /* Logic error */

	pin_down_extent(cache, bytenr, num_bytes, reserved);

	btrfs_put_block_group(cache);
	return 0;
}

/*
 * this function must be called within transaction
 */
int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
				    u64 bytenr, u64 num_bytes)
{
	struct btrfs_block_group_cache *cache;
	int ret;

	cache = btrfs_lookup_block_group(fs_info, bytenr);
	if (!cache)
		return -EINVAL;

	/*
	 * pull in the free space cache (if any) so that our pin
	 * removes the free space from the cache.  We have load_only set
	 * to one because the slow code to read in the free extents does check
	 * the pinned extents.
	 */
	btrfs_cache_block_group(cache, 1);

	pin_down_extent(cache, bytenr, num_bytes, 0);

	/* remove us from the free space cache (if we're there at all) */
	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
	btrfs_put_block_group(cache);
	return ret;
}

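/*
 * Make sure an extent referenced by a tree log is not handed out by the free
 * space cache of its block group, taking the caching progress into account.
 */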
static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
				   u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_block_group_cache *block_group;
	struct btrfs_caching_control *caching_ctl;

	block_group = btrfs_lookup_block_group(fs_info, start);
	if (!block_group)
		return -EINVAL;

	btrfs_cache_block_group(block_group, 0);
	caching_ctl = btrfs_get_caching_control(block_group);

	if (!caching_ctl) {
		/* Logic error */
		BUG_ON(!btrfs_block_group_cache_done(block_group));
		ret = btrfs_remove_free_space(block_group, start, num_bytes);
	} else {
		mutex_lock(&caching_ctl->mutex);

		if (start >= caching_ctl->progress) {
			ret = btrfs_add_excluded_extent(fs_info, start,
							num_bytes);
		} else if (start + num_bytes <= caching_ctl->progress) {
			ret = btrfs_remove_free_space(block_group,
						      start, num_bytes);
		} else {
			num_bytes = caching_ctl->progress - start;
			ret = btrfs_remove_free_space(block_group,
						      start, num_bytes);
			if (ret)
				goto out_lock;

			num_bytes = (start + num_bytes) -
				caching_ctl->progress;
			start = caching_ctl->progress;
			ret = btrfs_add_excluded_extent(fs_info, start,
							num_bytes);
		}
out_lock:
		mutex_unlock(&caching_ctl->mutex);
		btrfs_put_caching_control(caching_ctl);
	}
	btrfs_put_block_group(block_group);
	return ret;
}

int btrfs_exclude_logged_extents(struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct btrfs_file_extent_item *item;
	struct btrfs_key key;
	int found_type;
	int i;
	int ret = 0;

	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
		return 0;

	for (i = 0; i < btrfs_header_nritems(eb); i++) {
		btrfs_item_key_to_cpu(eb, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
		found_type = btrfs_file_extent_type(eb, item);
		if (found_type == BTRFS_FILE_EXTENT_INLINE)
			continue;
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			continue;
		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
		if (ret)
			break;
	}

	return ret;
}

static void
btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
{
	atomic_inc(&bg->reservations);
}

void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
{
	struct btrfs_caching_control *next;
	struct btrfs_caching_control *caching_ctl;
	struct btrfs_block_group_cache *cache;

	down_write(&fs_info->commit_root_sem);

	list_for_each_entry_safe(caching_ctl, next,
				 &fs_info->caching_block_groups, list) {
		cache = caching_ctl->block_group;
		if (btrfs_block_group_cache_done(cache)) {
			cache->last_byte_to_unpin = (u64)-1;
			list_del_init(&caching_ctl->list);
			btrfs_put_caching_control(caching_ctl);
		} else {
			cache->last_byte_to_unpin = caching_ctl->progress;
		}
	}

	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
		fs_info->pinned_extents = &fs_info->freed_extents[1];
	else
		fs_info->pinned_extents = &fs_info->freed_extents[0];

	up_write(&fs_info->commit_root_sem);

	btrfs_update_global_block_rsv(fs_info);
}

/*
 * Returns the free cluster for the given space info and sets empty_cluster to
 * what it should be based on the mount options.
 */
static struct btrfs_free_cluster *
fetch_cluster_info(struct btrfs_fs_info *fs_info,
		   struct btrfs_space_info *space_info, u64 *empty_cluster)
{
	struct btrfs_free_cluster *ret = NULL;

	*empty_cluster = 0;
	if (btrfs_mixed_space_info(space_info))
		return ret;

	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		ret = &fs_info->meta_alloc_cluster;
		if (btrfs_test_opt(fs_info, SSD))
			*empty_cluster = SZ_2M;
		else
			*empty_cluster = SZ_64K;
	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
		*empty_cluster = SZ_2M;
		ret = &fs_info->data_alloc_cluster;
	}
	}

	return ret;
}

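/*
 * Unpin the given range: return the space to the free space cache (when
 * requested), update pinned counters and refill the global block reserve or
 * pending space tickets with the released bytes.
 */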
static int unpin_extent_range(struct btrfs_fs_info *fs_info,
			      u64 start, u64 end,
			      const bool return_free_space)
{
	struct btrfs_block_group_cache *cache = NULL;
	struct btrfs_space_info *space_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	struct btrfs_free_cluster *cluster = NULL;
	u64 len;
	u64 total_unpinned = 0;
	u64 empty_cluster = 0;
	bool readonly;

	while (start <= end) {
		readonly = false;
		if (!cache ||
		    start >= cache->key.objectid + cache->key.offset) {
			if (cache)
				btrfs_put_block_group(cache);
			total_unpinned = 0;
			cache = btrfs_lookup_block_group(fs_info, start);
			BUG_ON(!cache); /* Logic error */

			cluster = fetch_cluster_info(fs_info,
						     cache->space_info,
						     &empty_cluster);
			empty_cluster <<= 1;
		}

		len = cache->key.objectid + cache->key.offset - start;
		len = min(len, end + 1 - start);

		if (start < cache->last_byte_to_unpin) {
			len = min(len, cache->last_byte_to_unpin - start);
			if (return_free_space)
				btrfs_add_free_space(cache, start, len);
		}

		start += len;
		total_unpinned += len;
		space_info = cache->space_info;

		/*
		 * If this space cluster has been marked as fragmented and we've
		 * unpinned enough in this block group to potentially allow a
		 * cluster to be created inside of it go ahead and clear the
		 * fragmented check.
		 */
		if (cluster && cluster->fragmented &&
		    total_unpinned > empty_cluster) {
			spin_lock(&cluster->lock);
			cluster->fragmented = 0;
			spin_unlock(&cluster->lock);
		}

		spin_lock(&space_info->lock);
		spin_lock(&cache->lock);
		cache->pinned -= len;
		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
		space_info->max_extent_size = 0;
		percpu_counter_add_batch(&space_info->total_bytes_pinned,
			    -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
		if (cache->ro) {
			space_info->bytes_readonly += len;
			readonly = true;
		}
		spin_unlock(&cache->lock);
		if (!readonly && return_free_space &&
		    global_rsv->space_info == space_info) {
			u64 to_add = len;

			spin_lock(&global_rsv->lock);
			if (!global_rsv->full) {
				to_add = min(len, global_rsv->size -
					     global_rsv->reserved);
				global_rsv->reserved += to_add;
				btrfs_space_info_update_bytes_may_use(fs_info,
						space_info, to_add);
				if (global_rsv->reserved >= global_rsv->size)
					global_rsv->full = 1;
				len -= to_add;
			}
			spin_unlock(&global_rsv->lock);
			/* Add to any tickets we may have */
			if (len)
				btrfs_try_granting_tickets(fs_info,
							   space_info);
		}
		spin_unlock(&space_info->lock);
	}

	if (cache)
		btrfs_put_block_group(cache);
	return 0;
}

int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group_cache *block_group, *tmp;
	struct list_head *deleted_bgs;
	struct extent_io_tree *unpin;
	u64 start;
	u64 end;
	int ret;

	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
		unpin = &fs_info->freed_extents[1];
	else
		unpin = &fs_info->freed_extents[0];

	while (!trans->aborted) {
		struct extent_state *cached_state = NULL;

		mutex_lock(&fs_info->unused_bg_unpin_mutex);
		ret = find_first_extent_bit(unpin, 0, &start, &end,
					    EXTENT_DIRTY, &cached_state);
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			break;
		}

		if (btrfs_test_opt(fs_info, DISCARD))
			ret = btrfs_discard_extent(fs_info, start,
						   end + 1 - start, NULL);

		clear_extent_dirty(unpin, start, end, &cached_state);
		unpin_extent_range(fs_info, start, end, true);
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
		free_extent_state(cached_state);
		cond_resched();
	}

	/*
	 * Transaction is finished.  We don't need the lock anymore.  We
	 * do need to clean up the block groups in case of a transaction
	 * abort.
	 */
	deleted_bgs = &trans->transaction->deleted_bgs;
	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
		u64 trimmed = 0;

		ret = -EROFS;
		if (!trans->aborted)
			ret = btrfs_discard_extent(fs_info,
						   block_group->key.objectid,
						   block_group->key.offset,
						   &trimmed);

		list_del_init(&block_group->bg_list);
		btrfs_put_block_group_trimming(block_group);
		btrfs_put_block_group(block_group);

		if (ret) {
			const char *errstr = btrfs_decode_error(ret);
			btrfs_warn(fs_info,
			   "discard failed while removing blockgroup: errno=%d %s",
				   ret, errstr);
		}
	}

}

2948
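/*
 * Drop @refs_to_drop references from an extent item.  When the last reference
 * goes away the extent item (and its csums, for data) is deleted and the
 * space is returned to the block group accounting.
 */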
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2949 2950 2951 2952
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extent_op)
2953
{
2954
	struct btrfs_fs_info *info = trans->fs_info;
C
Chris Mason 已提交
2955
	struct btrfs_key key;
2956
	struct btrfs_path *path;
2957
	struct btrfs_root *extent_root = info->extent_root;
2958
	struct extent_buffer *leaf;
2959 2960
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
2961
	int ret;
2962
	int is_data;
2963 2964 2965
	int extent_slot = 0;
	int found_extent = 0;
	int num_to_del = 1;
2966 2967
	u32 item_size;
	u64 refs;
2968 2969
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
J
Josef Bacik 已提交
2970
	int last_ref = 0;
2971
	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
C
Chris Mason 已提交
2972

2973
	path = btrfs_alloc_path();
2974 2975
	if (!path)
		return -ENOMEM;
2976

2977
	path->reada = READA_FORWARD;
2978
	path->leave_spinning = 1;
2979 2980 2981 2982

	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
	BUG_ON(!is_data && refs_to_drop != 1);

2983
	if (is_data)
2984
		skinny_metadata = false;
2985

2986 2987
	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
				    parent, root_objectid, owner_objectid,
2988
				    owner_offset);
2989
	if (ret == 0) {
2990
		extent_slot = path->slots[0];
2991 2992
		while (extent_slot >= 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
2993
					      extent_slot);
2994
			if (key.objectid != bytenr)
2995
				break;
2996 2997
			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes) {
2998 2999 3000
				found_extent = 1;
				break;
			}
3001 3002 3003 3004 3005
			if (key.type == BTRFS_METADATA_ITEM_KEY &&
			    key.offset == owner_objectid) {
				found_extent = 1;
				break;
			}
3006 3007
			if (path->slots[0] - extent_slot > 5)
				break;
3008
			extent_slot--;
3009
		}
3010

Z
Zheng Yan 已提交
3011
		if (!found_extent) {
3012
			BUG_ON(iref);
3013
			ret = remove_extent_backref(trans, path, NULL,
3014
						    refs_to_drop,
J
Josef Bacik 已提交
3015
						    is_data, &last_ref);
3016
			if (ret) {
3017
				btrfs_abort_transaction(trans, ret);
3018 3019
				goto out;
			}
3020
			btrfs_release_path(path);
3021
			path->leave_spinning = 1;
3022 3023 3024 3025 3026

			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;

3027 3028 3029 3030 3031
			if (!is_data && skinny_metadata) {
				key.type = BTRFS_METADATA_ITEM_KEY;
				key.offset = owner_objectid;
			}

Z
Zheng Yan 已提交
3032 3033
			ret = btrfs_search_slot(trans, extent_root,
						&key, path, -1, 1);
3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049
			if (ret > 0 && skinny_metadata && path->slots[0]) {
				/*
				 * Couldn't find our skinny metadata item,
				 * see if we have ye olde extent item.
				 */
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == num_bytes)
					ret = 0;
			}

			if (ret > 0 && skinny_metadata) {
				skinny_metadata = false;
3050
				key.objectid = bytenr;
3051 3052 3053 3054 3055 3056 3057
				key.type = BTRFS_EXTENT_ITEM_KEY;
				key.offset = num_bytes;
				btrfs_release_path(path);
				ret = btrfs_search_slot(trans, extent_root,
							&key, path, -1, 1);
			}

3058
			if (ret) {
J
Jeff Mahoney 已提交
3059 3060 3061
				btrfs_err(info,
					  "umm, got %d back from search, was looking for %llu",
					  ret, bytenr);
3062
				if (ret > 0)
3063
					btrfs_print_leaf(path->nodes[0]);
3064
			}
3065
			if (ret < 0) {
3066
				btrfs_abort_transaction(trans, ret);
3067 3068
				goto out;
			}
Z
Zheng Yan 已提交
3069 3070
			extent_slot = path->slots[0];
		}
3071
	} else if (WARN_ON(ret == -ENOENT)) {
3072
		btrfs_print_leaf(path->nodes[0]);
3073 3074
		btrfs_err(info,
			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
3075 3076
			bytenr, parent, root_objectid, owner_objectid,
			owner_offset);
3077
		btrfs_abort_transaction(trans, ret);
3078
		goto out;
3079
	} else {
3080
		btrfs_abort_transaction(trans, ret);
3081
		goto out;
3082
	}
3083 3084

	leaf = path->nodes[0];
3085
	item_size = btrfs_item_size_nr(leaf, extent_slot);
3086
	if (unlikely(item_size < sizeof(*ei))) {
3087 3088 3089 3090 3091
		ret = -EINVAL;
		btrfs_print_v0_err(info);
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
3092
	ei = btrfs_item_ptr(leaf, extent_slot,
C
Chris Mason 已提交
3093
			    struct btrfs_extent_item);
3094 3095
	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
	    key.type == BTRFS_EXTENT_ITEM_KEY) {
3096 3097 3098 3099 3100
		struct btrfs_tree_block_info *bi;
		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
	}
3101

3102
	refs = btrfs_extent_refs(leaf, ei);
3103
	if (refs < refs_to_drop) {
J
Jeff Mahoney 已提交
3104 3105 3106
		btrfs_err(info,
			  "trying to drop %d refs but we only have %Lu for bytenr %Lu",
			  refs_to_drop, refs, bytenr);
3107
		ret = -EINVAL;
3108
		btrfs_abort_transaction(trans, ret);
3109 3110
		goto out;
	}
3111
	refs -= refs_to_drop;
3112

3113 3114 3115 3116 3117 3118
	if (refs > 0) {
		if (extent_op)
			__run_delayed_extent_op(extent_op, leaf, ei);
		/*
		 * In the case of inline back ref, reference count will
		 * be updated by remove_extent_backref
3119
		 */
3120 3121 3122 3123 3124 3125 3126
		if (iref) {
			BUG_ON(!found_extent);
		} else {
			btrfs_set_extent_refs(leaf, ei, refs);
			btrfs_mark_buffer_dirty(leaf);
		}
		if (found_extent) {
3127 3128 3129
			ret = remove_extent_backref(trans, path, iref,
						    refs_to_drop, is_data,
						    &last_ref);
3130
			if (ret) {
3131
				btrfs_abort_transaction(trans, ret);
3132 3133
				goto out;
			}
3134
		}
3135 3136 3137
	} else {
		if (found_extent) {
			BUG_ON(is_data && refs_to_drop !=
3138
			       extent_data_ref_count(path, iref));
3139 3140 3141 3142 3143 3144 3145
			if (iref) {
				BUG_ON(path->slots[0] != extent_slot);
			} else {
				BUG_ON(path->slots[0] != extent_slot + 1);
				path->slots[0] = extent_slot;
				num_to_del = 2;
			}
C
Chris Mason 已提交
3146
		}
3147

J
Josef Bacik 已提交
3148
		last_ref = 1;
3149 3150
		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
				      num_to_del);
3151
		if (ret) {
3152
			btrfs_abort_transaction(trans, ret);
3153 3154
			goto out;
		}
3155
		btrfs_release_path(path);
3156

3157
		if (is_data) {
3158
			ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
3159
			if (ret) {
3160
				btrfs_abort_transaction(trans, ret);
3161 3162
				goto out;
			}
3163 3164
		}

3165
		ret = add_to_free_space_tree(trans, bytenr, num_bytes);
3166
		if (ret) {
3167
			btrfs_abort_transaction(trans, ret);
3168 3169 3170
			goto out;
		}

3171
		ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
3172
		if (ret) {
3173
			btrfs_abort_transaction(trans, ret);
3174 3175
			goto out;
		}
3176
	}
J
Josef Bacik 已提交
3177 3178
	btrfs_release_path(path);

3179
out:
3180
	btrfs_free_path(path);
3181 3182 3183
	return ret;
}

/*
 * when we free a block, it is possible (and likely) that we free the last
 * delayed ref for that extent as well.  This searches the delayed ref tree for
 * a given extent, and if there are no other delayed refs to be processed, it
 * removes it from the tree.
 */
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
				      u64 bytenr)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (!head)
		goto out_delayed_unlock;

	spin_lock(&head->lock);
	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
		goto out;

	if (cleanup_extent_op(head) != NULL)
		goto out;

	/*
	 * waiting for the lock here would deadlock.  If someone else has it
	 * locked they are already in the process of dropping it anyway
	 */
	if (!mutex_trylock(&head->mutex))
		goto out;

	btrfs_delete_ref_head(delayed_refs, head);
	head->processing = 0;

	spin_unlock(&head->lock);
	spin_unlock(&delayed_refs->lock);

	BUG_ON(head->extent_op);
	if (head->must_insert_reserved)
		ret = 1;

	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
	mutex_unlock(&head->mutex);
	btrfs_put_delayed_ref_head(head);
	return ret;
out:
	spin_unlock(&head->lock);

out_delayed_unlock:
	spin_unlock(&delayed_refs->lock);
	return 0;
}

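/*
 * Free a tree block: queue a delayed ref drop for it and, when the block was
 * allocated in the current transaction and this is the last reference, return
 * it directly to the free space cache instead of pinning it until commit.
 */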
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct extent_buffer *buf,
			   u64 parent, int last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_ref generic_ref = { 0 };
	int pin = 1;
	int ret;

	btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
			       buf->start, buf->len, parent);
	btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
			    root->root_key.objectid);

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
		int old_ref_mod, new_ref_mod;

		btrfs_ref_tree_mod(fs_info, &generic_ref);
		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
						 &old_ref_mod, &new_ref_mod);
		BUG_ON(ret); /* -ENOMEM */
		pin = old_ref_mod >= 0 && new_ref_mod < 0;
	}

	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
		struct btrfs_block_group_cache *cache;

		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
			ret = check_ref_cleanup(trans, buf->start);
			if (!ret)
				goto out;
		}

		pin = 0;
		cache = btrfs_lookup_block_group(fs_info, buf->start);

		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
			pin_down_extent(cache, buf->start, buf->len, 1);
			btrfs_put_block_group(cache);
			goto out;
		}

		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));

		btrfs_add_free_space(cache, buf->start, buf->len);
		btrfs_free_reserved_bytes(cache, buf->len, 0);
		btrfs_put_block_group(cache);
		trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
	}
out:
	if (pin)
		add_pinned_bytes(fs_info, &generic_ref);

	if (last_ref) {
		/*
		 * Deleting the buffer, clear the corrupt flag since it doesn't
		 * matter anymore.
		 */
		clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
	}
}

/* Can return -ENOMEM */
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int old_ref_mod, new_ref_mod;
	int ret;

	if (btrfs_is_testing(fs_info))
		return 0;

	/*
	 * tree log blocks never actually go into the extent allocation
	 * tree, just update pinning info and exit early.
	 */
	if ((ref->type == BTRFS_REF_METADATA &&
	     ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
	    (ref->type == BTRFS_REF_DATA &&
	     ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
		/* unlocks the pinned mutex */
		btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
		old_ref_mod = new_ref_mod = 0;
		ret = 0;
	} else if (ref->type == BTRFS_REF_METADATA) {
		ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
						 &old_ref_mod, &new_ref_mod);
	} else {
		ret = btrfs_add_delayed_data_ref(trans, ref, 0,
						 &old_ref_mod, &new_ref_mod);
	}

	if (!((ref->type == BTRFS_REF_METADATA &&
	       ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
	      (ref->type == BTRFS_REF_DATA &&
	       ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
		btrfs_ref_tree_mod(fs_info, ref);

	if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
		add_pinned_bytes(fs_info, ref);

	return ret;
}

enum btrfs_loop_type {
	LOOP_CACHING_NOWAIT,
	LOOP_CACHING_WAIT,
	LOOP_ALLOC_CHUNK,
	LOOP_NO_EMPTY_SIZE,
};

static inline void
btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
		       int delalloc)
{
	if (delalloc)
		down_read(&cache->data_rwsem);
}

static inline void
btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
		       int delalloc)
{
	btrfs_get_block_group(cache);
	if (delalloc)
		down_read(&cache->data_rwsem);
}

static struct btrfs_block_group_cache *
btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
		   struct btrfs_free_cluster *cluster,
		   int delalloc)
{
	struct btrfs_block_group_cache *used_bg = NULL;

	spin_lock(&cluster->refill_lock);
	while (1) {
		used_bg = cluster->block_group;
		if (!used_bg)
			return NULL;

		if (used_bg == block_group)
			return used_bg;

		btrfs_get_block_group(used_bg);

		if (!delalloc)
			return used_bg;

		if (down_read_trylock(&used_bg->data_rwsem))
			return used_bg;

		spin_unlock(&cluster->refill_lock);

		/* We should only have one-level nested. */
		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);

		spin_lock(&cluster->refill_lock);
		if (used_bg == cluster->block_group)
			return used_bg;

		up_read(&used_bg->data_rwsem);
		btrfs_put_block_group(used_bg);
	}
}

static inline void
btrfs_release_block_group(struct btrfs_block_group_cache *cache,
			 int delalloc)
{
	if (delalloc)
		up_read(&cache->data_rwsem);
	btrfs_put_block_group(cache);
}

/*
 * Structure used internally for find_free_extent() function.  Wraps needed
 * parameters.
 */
struct find_free_extent_ctl {
	/* Basic allocation info */
	u64 ram_bytes;
	u64 num_bytes;
	u64 empty_size;
	u64 flags;
	int delalloc;

	/* Where to start the search inside the bg */
	u64 search_start;

	/* For clustered allocation */
	u64 empty_cluster;

	bool have_caching_bg;
	bool orig_have_caching_bg;

	/* RAID index, converted from flags */
	int index;

	/*
	 * Current loop number, check find_free_extent_update_loop() for details
	 */
	int loop;

	/*
	 * Whether we're refilling a cluster, if true we need to re-search
	 * current block group but don't try to refill the cluster again.
	 */
	bool retry_clustered;

	/*
	 * Whether we're updating free space cache, if true we need to re-search
	 * current block group but don't try updating free space cache again.
	 */
	bool retry_unclustered;

	/* If current block group is cached */
	int cached;

	/* Max contiguous hole found */
	u64 max_extent_size;

	/* Total free space from free space cache, not always contiguous */
	u64 total_free_space;

	/* Found result */
	u64 found_offset;
};


/*
 * Helper function for find_free_extent().
 *
 * Return -ENOENT to inform caller that we need fallback to unclustered mode.
 * Return -EAGAIN to inform caller that we need to re-search this block group
 * Return >0 to inform caller that we find nothing
 * Return 0 means we have found a location and set ffe_ctl->found_offset.
 */
static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
		struct btrfs_free_cluster *last_ptr,
		struct find_free_extent_ctl *ffe_ctl,
		struct btrfs_block_group_cache **cluster_bg_ret)
{
	struct btrfs_block_group_cache *cluster_bg;
	u64 aligned_cluster;
	u64 offset;
	int ret;

	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
	if (!cluster_bg)
		goto refill_cluster;
	if (cluster_bg != bg && (cluster_bg->ro ||
	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
		goto release_cluster;

	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
			ffe_ctl->num_bytes, cluster_bg->key.objectid,
			&ffe_ctl->max_extent_size);
	if (offset) {
		/* We have a block, we're done */
		spin_unlock(&last_ptr->refill_lock);
		trace_btrfs_reserve_extent_cluster(cluster_bg,
				ffe_ctl->search_start, ffe_ctl->num_bytes);
		*cluster_bg_ret = cluster_bg;
		ffe_ctl->found_offset = offset;
		return 0;
	}
	WARN_ON(last_ptr->block_group != cluster_bg);

release_cluster:
	/*
	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
	 * let's just skip it and let the allocator find whatever block it can
	 * find. If we reach this point, we will have tried the cluster
	 * allocator plenty of times and not have found anything, so we are
	 * likely way too fragmented for the clustering stuff to find anything.
	 *
	 * However, if the cluster is taken from the current block group,
	 * release the cluster first, so that we stand a better chance of
	 * succeeding in the unclustered allocation.
	 */
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
		spin_unlock(&last_ptr->refill_lock);
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
		return -ENOENT;
	}

	/* This cluster didn't work out, free it and start over */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);

	if (cluster_bg != bg)
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);

refill_cluster:
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
		spin_unlock(&last_ptr->refill_lock);
		return -ENOENT;
	}

	aligned_cluster = max_t(u64,
			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
			bg->full_stripe_len);
	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
			ffe_ctl->num_bytes, aligned_cluster);
	if (ret == 0) {
		/* Now pull our allocation out of this cluster */
		offset = btrfs_alloc_from_cluster(bg, last_ptr,
				ffe_ctl->num_bytes, ffe_ctl->search_start,
				&ffe_ctl->max_extent_size);
		if (offset) {
			/* We found one, proceed */
			spin_unlock(&last_ptr->refill_lock);
			trace_btrfs_reserve_extent_cluster(bg,
					ffe_ctl->search_start,
					ffe_ctl->num_bytes);
			ffe_ctl->found_offset = offset;
			return 0;
		}
	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
		   !ffe_ctl->retry_clustered) {
		spin_unlock(&last_ptr->refill_lock);

		ffe_ctl->retry_clustered = true;
		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
				ffe_ctl->empty_cluster + ffe_ctl->empty_size);
		return -EAGAIN;
	}
	/*
	 * At this point we either didn't find a cluster or we weren't able to
	 * allocate a block from our cluster.  Free the cluster we've been
	 * trying to use, and go to the next block group.
	 */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);
	spin_unlock(&last_ptr->refill_lock);
	return 1;
}
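
/*
 * Illustrative sketch only (an abridged mirror of the caller in this file,
 * find_free_extent(), not a new interface): how a caller is expected to act
 * on the return codes documented above.
 *
 *	ret = find_free_extent_clustered(block_group, last_ptr, &ffe_ctl,
 *					 &cluster_bg);
 *	if (ret == 0)
 *		goto checks;		// ffe_ctl.found_offset is valid
 *	else if (ret == -EAGAIN)
 *		goto have_block_group;	// re-search this block group
 *	else if (ret > 0)
 *		goto loop;		// nothing here, try the next block group
 *	// ret == -ENOENT: fall through to the unclustered allocator
 */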

/*
 * Return >0 to inform the caller that we found nothing.
 * Return 0 when we found a free extent and set ffe_ctl->found_offset.
 * Return -EAGAIN to inform the caller that we need to re-search this block group.
 */
static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
		struct btrfs_free_cluster *last_ptr,
		struct find_free_extent_ctl *ffe_ctl)
{
	u64 offset;

	/*
	 * We are doing an unclustered allocation, set the fragmented flag so
	 * we don't bother trying to set up a cluster again until we get more
	 * space.
	 */
	if (unlikely(last_ptr)) {
		spin_lock(&last_ptr->lock);
		last_ptr->fragmented = 1;
		spin_unlock(&last_ptr->lock);
	}
	if (ffe_ctl->cached) {
		struct btrfs_free_space_ctl *free_space_ctl;

		free_space_ctl = bg->free_space_ctl;
		spin_lock(&free_space_ctl->tree_lock);
		if (free_space_ctl->free_space <
		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
		    ffe_ctl->empty_size) {
			ffe_ctl->total_free_space = max_t(u64,
					ffe_ctl->total_free_space,
					free_space_ctl->free_space);
			spin_unlock(&free_space_ctl->tree_lock);
			return 1;
		}
		spin_unlock(&free_space_ctl->tree_lock);
	}

	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
			ffe_ctl->num_bytes, ffe_ctl->empty_size,
			&ffe_ctl->max_extent_size);

	/*
	 * If we didn't find a chunk, and we haven't failed on this block group
	 * before, and this block group is in the middle of caching and we are
	 * ok with waiting, then go ahead and wait for progress to be made, and
	 * set @retry_unclustered to true.
	 *
	 * If @retry_unclustered is true then we've already waited on this
	 * block group once and should move on to the next block group.
	 */
	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
	    ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
						      ffe_ctl->empty_size);
		ffe_ctl->retry_unclustered = true;
		return -EAGAIN;
	} else if (!offset) {
		return 1;
	}
	ffe_ctl->found_offset = offset;
	return 0;
}

/*
 * Return >0 means the caller needs to re-search for a free extent.
 * Return 0 means we have the needed free extent.
 * Return <0 means we failed to locate any free extent.
 */
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
					struct btrfs_free_cluster *last_ptr,
					struct btrfs_key *ins,
					struct find_free_extent_ctl *ffe_ctl,
					int full_search, bool use_cluster)
{
	struct btrfs_root *root = fs_info->extent_root;
	int ret;

	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
		ffe_ctl->orig_have_caching_bg = true;

	if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
	    ffe_ctl->have_caching_bg)
		return 1;

	if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
		return 1;

	if (ins->objectid) {
		if (!use_cluster && last_ptr) {
			spin_lock(&last_ptr->lock);
			last_ptr->window_start = ins->objectid;
			spin_unlock(&last_ptr->lock);
		}
		return 0;
	}

	/*
	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
	 *			caching kthreads as we move along
	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
	 *		       again
	 */
	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
		ffe_ctl->index = 0;
		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
			/*
			 * We want to skip the LOOP_CACHING_WAIT step if we
			 * don't have any uncached bgs and we've already done a
			 * full search through.
			 */
			if (ffe_ctl->orig_have_caching_bg || !full_search)
				ffe_ctl->loop = LOOP_CACHING_WAIT;
			else
				ffe_ctl->loop = LOOP_ALLOC_CHUNK;
		} else {
			ffe_ctl->loop++;
		}

		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
			struct btrfs_trans_handle *trans;
			int exist = 0;

			trans = current->journal_info;
			if (trans)
				exist = 1;
			else
				trans = btrfs_join_transaction(root);

			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				return ret;
			}

			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
						CHUNK_ALLOC_FORCE);

			/*
			 * If we can't allocate a new chunk we've already looped
			 * through at least once, move on to the NO_EMPTY_SIZE
			 * case.
			 */
			if (ret == -ENOSPC)
				ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;

			/* Do not bail out on ENOSPC since we can do more. */
			if (ret < 0 && ret != -ENOSPC)
				btrfs_abort_transaction(trans, ret);
			else
				ret = 0;
			if (!exist)
				btrfs_end_transaction(trans);
			if (ret)
				return ret;
		}

		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
			/*
			 * Don't loop again if we already have no empty_size and
			 * no empty_cluster.
			 */
			if (ffe_ctl->empty_size == 0 &&
			    ffe_ctl->empty_cluster == 0)
				return -ENOSPC;
			ffe_ctl->empty_size = 0;
			ffe_ctl->empty_cluster = 0;
		}
		return 1;
	}
	return -ENOSPC;
}

/*
 * walks the btree of allocated extents and finds a hole of a given size.
 * The key ins is changed to record the hole:
 * ins->objectid == start position
 * ins->flags = BTRFS_EXTENT_ITEM_KEY
 * ins->offset == the size of the hole.
 * Any available blocks before search_start are skipped.
 *
 * If there is no suitable free space, we will record the max size of
 * the free space extent currently.
 *
 * The overall logic and call chain:
 *
 * find_free_extent()
 * |- Iterate through all block groups
 * |  |- Get a valid block group
 * |  |- Try to do clustered allocation in that block group
 * |  |- Try to do unclustered allocation in that block group
 * |  |- Check if the result is valid
 * |  |  |- If valid, then exit
 * |  |- Jump to next block group
 * |
 * |- Push harder to find free extents
 *    |- If not found, re-iterate all block groups
 */
static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
				u64 ram_bytes, u64 num_bytes, u64 empty_size,
				u64 hint_byte, struct btrfs_key *ins,
				u64 flags, int delalloc)
{
	int ret = 0;
	struct btrfs_free_cluster *last_ptr = NULL;
	struct btrfs_block_group_cache *block_group = NULL;
	struct find_free_extent_ctl ffe_ctl = {0};
	struct btrfs_space_info *space_info;
	bool use_cluster = true;
	bool full_search = false;

	WARN_ON(num_bytes < fs_info->sectorsize);

	ffe_ctl.ram_bytes = ram_bytes;
	ffe_ctl.num_bytes = num_bytes;
	ffe_ctl.empty_size = empty_size;
	ffe_ctl.flags = flags;
	ffe_ctl.search_start = 0;
	ffe_ctl.retry_clustered = false;
	ffe_ctl.retry_unclustered = false;
	ffe_ctl.delalloc = delalloc;
	ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
	ffe_ctl.have_caching_bg = false;
	ffe_ctl.orig_have_caching_bg = false;
	ffe_ctl.found_offset = 0;

	ins->type = BTRFS_EXTENT_ITEM_KEY;
	ins->objectid = 0;
	ins->offset = 0;

	trace_find_free_extent(fs_info, num_bytes, empty_size, flags);

	space_info = btrfs_find_space_info(fs_info, flags);
	if (!space_info) {
		btrfs_err(fs_info, "No space info for %llu", flags);
		return -ENOSPC;
	}

	/*
	 * If our free space is heavily fragmented we may not be able to make
	 * big contiguous allocations, so instead of doing the expensive search
	 * for free space, simply return ENOSPC with our max_extent_size so we
	 * can go ahead and search for a more manageable chunk.
	 *
	 * If our max_extent_size is large enough for our allocation simply
	 * disable clustering since we will likely not be able to find enough
	 * space to create a cluster and induce latency trying.
	 */
	if (unlikely(space_info->max_extent_size)) {
		spin_lock(&space_info->lock);
		if (space_info->max_extent_size &&
		    num_bytes > space_info->max_extent_size) {
			ins->offset = space_info->max_extent_size;
			spin_unlock(&space_info->lock);
			return -ENOSPC;
		} else if (space_info->max_extent_size) {
			use_cluster = false;
		}
		spin_unlock(&space_info->lock);
	}

	last_ptr = fetch_cluster_info(fs_info, space_info,
				      &ffe_ctl.empty_cluster);
	if (last_ptr) {
		spin_lock(&last_ptr->lock);
		if (last_ptr->block_group)
			hint_byte = last_ptr->window_start;
		if (last_ptr->fragmented) {
			/*
			 * We still set window_start so we can keep track of the
			 * last place we found an allocation to try and save
			 * some time.
			 */
			hint_byte = last_ptr->window_start;
			use_cluster = false;
		}
		spin_unlock(&last_ptr->lock);
	}

	ffe_ctl.search_start = max(ffe_ctl.search_start,
				   first_logical_byte(fs_info, 0));
	ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
	if (ffe_ctl.search_start == hint_byte) {
		block_group = btrfs_lookup_block_group(fs_info,
						       ffe_ctl.search_start);
		/*
		 * we don't want to use the block group if it doesn't match our
		 * allocation bits, or if it's not cached.
		 *
		 * However if we are re-searching with an ideal block group
		 * picked out then we don't care that the block group is cached.
		 */
		if (block_group && block_group_bits(block_group, flags) &&
		    block_group->cached != BTRFS_CACHE_NO) {
			down_read(&space_info->groups_sem);
			if (list_empty(&block_group->list) ||
			    block_group->ro) {
				/*
				 * someone is removing this block group,
				 * we can't jump into the have_block_group
				 * target because our list pointers are not
				 * valid
				 */
				btrfs_put_block_group(block_group);
				up_read(&space_info->groups_sem);
			} else {
				ffe_ctl.index = btrfs_bg_flags_to_raid_index(
						block_group->flags);
				btrfs_lock_block_group(block_group, delalloc);
				goto have_block_group;
			}
		} else if (block_group) {
			btrfs_put_block_group(block_group);
		}
	}
search:
	ffe_ctl.have_caching_bg = false;
	if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
	    ffe_ctl.index == 0)
		full_search = true;
	down_read(&space_info->groups_sem);
	list_for_each_entry(block_group,
			    &space_info->block_groups[ffe_ctl.index], list) {
		/* If the block group is read-only, we can skip it entirely. */
		if (unlikely(block_group->ro))
			continue;

		btrfs_grab_block_group(block_group, delalloc);
		ffe_ctl.search_start = block_group->key.objectid;

		/*
		 * this can happen if we end up cycling through all the
		 * raid types, but we want to make sure we only allocate
		 * for the proper type.
		 */
		if (!block_group_bits(block_group, flags)) {
			u64 extra = BTRFS_BLOCK_GROUP_DUP |
				BTRFS_BLOCK_GROUP_RAID1_MASK |
				BTRFS_BLOCK_GROUP_RAID56_MASK |
				BTRFS_BLOCK_GROUP_RAID10;

			/*
			 * if they asked for extra copies and this block group
			 * doesn't provide them, bail.  This does allow us to
			 * fill raid0 from raid1.
			 */
			if ((flags & extra) && !(block_group->flags & extra))
				goto loop;

			/*
			 * This block group has different flags than we want.
			 * It's possible that we have MIXED_GROUP flag but no
			 * block group is mixed.  Just skip such block group.
			 */
			btrfs_release_block_group(block_group, delalloc);
			continue;
		}

have_block_group:
		ffe_ctl.cached = btrfs_block_group_cache_done(block_group);
		if (unlikely(!ffe_ctl.cached)) {
			ffe_ctl.have_caching_bg = true;
			ret = btrfs_cache_block_group(block_group, 0);
			BUG_ON(ret < 0);
			ret = 0;
		}

		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
			goto loop;

		/*
		 * Ok we want to try and use the cluster allocator, so
		 * let's look there
		 */
		if (last_ptr && use_cluster) {
			struct btrfs_block_group_cache *cluster_bg = NULL;

			ret = find_free_extent_clustered(block_group, last_ptr,
							 &ffe_ctl, &cluster_bg);

			if (ret == 0) {
				if (cluster_bg && cluster_bg != block_group) {
					btrfs_release_block_group(block_group,
								  delalloc);
					block_group = cluster_bg;
				}
				goto checks;
			} else if (ret == -EAGAIN) {
				goto have_block_group;
			} else if (ret > 0) {
				goto loop;
			}
			/* ret == -ENOENT case falls through */
		}

		ret = find_free_extent_unclustered(block_group, last_ptr,
						   &ffe_ctl);
		if (ret == -EAGAIN)
			goto have_block_group;
		else if (ret > 0)
			goto loop;
		/* ret == 0 case falls through */
checks:
		ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
					     fs_info->stripesize);

		/* move on to the next group */
		if (ffe_ctl.search_start + num_bytes >
		    block_group->key.objectid + block_group->key.offset) {
			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
					     num_bytes);
			goto loop;
		}

		if (ffe_ctl.found_offset < ffe_ctl.search_start)
			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
				ffe_ctl.search_start - ffe_ctl.found_offset);

		ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
				num_bytes, delalloc);
		if (ret == -EAGAIN) {
			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
					     num_bytes);
			goto loop;
		}
		btrfs_inc_block_group_reservations(block_group);

		/* we are all good, let's return */
		ins->objectid = ffe_ctl.search_start;
		ins->offset = num_bytes;

		trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
					   num_bytes);
		btrfs_release_block_group(block_group, delalloc);
		break;
loop:
		ffe_ctl.retry_clustered = false;
		ffe_ctl.retry_unclustered = false;
		BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
		       ffe_ctl.index);
		btrfs_release_block_group(block_group, delalloc);
		cond_resched();
	}
	up_read(&space_info->groups_sem);

	ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
					   full_search, use_cluster);
	if (ret > 0)
		goto search;

	if (ret == -ENOSPC) {
		/*
		 * Use ffe_ctl->total_free_space as fallback if we can't find
		 * any contiguous hole.
		 */
		if (!ffe_ctl.max_extent_size)
			ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
		spin_lock(&space_info->lock);
		space_info->max_extent_size = ffe_ctl.max_extent_size;
		spin_unlock(&space_info->lock);
		ins->offset = ffe_ctl.max_extent_size;
	}
	return ret;
}

/*
 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
 *			  hole that is at least as big as @num_bytes.
 *
 * @root           -	The root that will contain this extent
 *
 * @ram_bytes      -	The amount of space in ram that @num_bytes take. This
 *			is used for accounting purposes. This value differs
 *			from @num_bytes only in the case of compressed extents.
 *
 * @num_bytes      -	Number of bytes to allocate on-disk.
 *
 * @min_alloc_size -	Indicates the minimum amount of space that the
 *			allocator should try to satisfy. In some cases
 *			@num_bytes may be larger than what is required and if
 *			the filesystem is fragmented then allocation fails.
 *			However, the presence of @min_alloc_size gives a
 *			chance to try and satisfy the smaller allocation.
 *
 * @empty_size     -	A hint that you plan on doing more COW. This is the
 *			size in bytes the allocator should try to find free
 *			next to the block it returns.  This is just a hint and
 *			may be ignored by the allocator.
 *
 * @hint_byte      -	Hint to the allocator to start searching above the byte
 *			address passed. It might be ignored.
 *
 * @ins            -	This key is modified to record the found hole. It will
 *			have the following values:
 *			ins->objectid == start position
 *			ins->flags = BTRFS_EXTENT_ITEM_KEY
 *			ins->offset == the size of the hole.
 *
 * @is_data        -	Boolean flag indicating whether an extent is
 *			allocated for data (true) or metadata (false)
 *
 * @delalloc       -	Boolean flag indicating whether this allocation is for
 *			delalloc or not. If 'true' data_rwsem of block groups
 *			is going to be acquired.
 *
 *
 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
 * case -ENOSPC is returned then @ins->offset will contain the size of the
 * largest available hole the allocator managed to find.
 */
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
			 u64 num_bytes, u64 min_alloc_size,
			 u64 empty_size, u64 hint_byte,
			 struct btrfs_key *ins, int is_data, int delalloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	bool final_tried = num_bytes == min_alloc_size;
	u64 flags;
	int ret;

	flags = get_alloc_profile_by_root(root, is_data);
again:
	WARN_ON(num_bytes < fs_info->sectorsize);
	ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
			       hint_byte, ins, flags, delalloc);
	if (!ret && !is_data) {
		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
	} else if (ret == -ENOSPC) {
		if (!final_tried && ins->offset) {
			num_bytes = min(num_bytes >> 1, ins->offset);
			num_bytes = round_down(num_bytes,
					       fs_info->sectorsize);
			num_bytes = max(num_bytes, min_alloc_size);
			ram_bytes = num_bytes;
			if (num_bytes == min_alloc_size)
				final_tried = true;
			goto again;
		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
			struct btrfs_space_info *sinfo;

			sinfo = btrfs_find_space_info(fs_info, flags);
			btrfs_err(fs_info,
				  "allocation failed flags %llu, wanted %llu",
				  flags, num_bytes);
			if (sinfo)
				btrfs_dump_space_info(fs_info, sinfo,
						      num_bytes, 1);
		}
	}

	return ret;
}
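
/*
 * Illustrative sketch only (not taken verbatim from any caller): a typical
 * data-path caller reserves an on-disk extent and, on a later error path,
 * returns the reservation.  The surrounding variables (num_bytes, alloc_hint)
 * are assumptions made for the example.
 *
 *	struct btrfs_key ins = { 0 };
 *	int ret;
 *
 *	ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
 *				   fs_info->sectorsize, 0, alloc_hint,
 *				   &ins, 1, 1);
 *	if (ret == -ENOSPC)
 *		return ret;	// ins.offset holds the largest hole found
 *	...
 *	// error path that no longer needs the space
 *	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
 */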

static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
					u64 start, u64 len,
					int pin, int delalloc)
{
	struct btrfs_block_group_cache *cache;
	int ret = 0;

	cache = btrfs_lookup_block_group(fs_info, start);
	if (!cache) {
		btrfs_err(fs_info, "Unable to find block group for %llu",
			  start);
		return -ENOSPC;
	}

	if (pin)
		pin_down_extent(cache, start, len, 1);
	else {
		if (btrfs_test_opt(fs_info, DISCARD))
			ret = btrfs_discard_extent(fs_info, start, len, NULL);
		btrfs_add_free_space(cache, start, len);
		btrfs_free_reserved_bytes(cache, len, delalloc);
		trace_btrfs_reserved_extent_free(fs_info, start, len);
	}

	btrfs_put_block_group(cache);
	return ret;
}

int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
			       u64 start, u64 len, int delalloc)
{
	return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
}

int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
				       u64 start, u64 len)
{
	return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
}

static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int type;
	u32 size;

	if (parent > 0)
		type = BTRFS_SHARED_DATA_REF_KEY;
	else
		type = BTRFS_EXTENT_DATA_REF_KEY;

	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;
	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      ins, size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_DATA);

	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (parent > 0) {
		struct btrfs_shared_data_ref *ref;
		ref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
	} else {
		struct btrfs_extent_data_ref *ref;
		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
	}

	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_free_path(path);

	ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
	if (ret)
		return ret;

	ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
	if (ret) { /* -ENOENT, logic error */
		btrfs_err(fs_info, "update block group failed for %llu %llu",
			ins->objectid, ins->offset);
		BUG();
	}
	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
	return ret;
}

static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_key extent_key;
	struct btrfs_tree_block_info *block_info;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_delayed_tree_ref *ref;
	u32 size = sizeof(*extent_item) + sizeof(*iref);
	u64 num_bytes;
	u64 flags = extent_op->flags_to_set;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

	ref = btrfs_delayed_node_to_tree_ref(node);

	extent_key.objectid = node->bytenr;
	if (skinny_metadata) {
		extent_key.offset = ref->level;
		extent_key.type = BTRFS_METADATA_ITEM_KEY;
		num_bytes = fs_info->nodesize;
	} else {
		extent_key.offset = node->num_bytes;
		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
		size += sizeof(*block_info);
		num_bytes = node->num_bytes;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;
	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      &extent_key, size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, 1);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);

	if (skinny_metadata) {
		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	} else {
		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
		btrfs_set_tree_block_level(leaf, block_info, ref->level);
		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
	}

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_SHARED_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
	} else {
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_TREE_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
	}

	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	ret = remove_from_free_space_tree(trans, extent_key.objectid,
					  num_bytes);
	if (ret)
		return ret;

	ret = btrfs_update_block_group(trans, extent_key.objectid,
				       fs_info->nodesize, 1);
	if (ret) { /* -ENOENT, logic error */
		btrfs_err(fs_info, "update block group failed for %llu %llu",
			extent_key.objectid, extent_key.offset);
		BUG();
	}

	trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
					  fs_info->nodesize);
	return ret;
}

int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root, u64 owner,
				     u64 offset, u64 ram_bytes,
				     struct btrfs_key *ins)
{
	struct btrfs_ref generic_ref = { 0 };
	int ret;

	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);

	btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
			       ins->objectid, ins->offset, 0);
	btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
	btrfs_ref_tree_mod(root->fs_info, &generic_ref);
	ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
					 ram_bytes, NULL, NULL);
	return ret;
}

/*
 * this is used by the tree logging recovery code.  It records that
 * an extent has been allocated and makes sure to clear the free
 * space cache bits as well
 */
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
				   u64 root_objectid, u64 owner, u64 offset,
				   struct btrfs_key *ins)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;

	/*
	 * Mixed block groups will exclude before processing the log so we only
	 * need to do the exclude dance if this fs isn't mixed.
	 */
	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = __exclude_logged_extent(fs_info, ins->objectid,
					      ins->offset);
		if (ret)
			return ret;
	}

	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
	if (!block_group)
		return -EINVAL;

	space_info = block_group->space_info;
	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	space_info->bytes_reserved += ins->offset;
	block_group->reserved += ins->offset;
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);

	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
					 offset, ins, 1);
	btrfs_put_block_group(block_group);
	return ret;
}

static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      u64 bytenr, int level, u64 owner)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *buf;

	buf = btrfs_find_create_tree_block(fs_info, bytenr);
	if (IS_ERR(buf))
		return buf;

	/*
	 * Extra safety check in case the extent tree is corrupted and extent
	 * allocator chooses to use a tree block which is already used and
	 * locked.
	 */
	if (buf->lock_owner == current->pid) {
		btrfs_err_rl(fs_info,
"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
			buf->start, btrfs_header_owner(buf), current->pid);
		free_extent_buffer(buf);
		return ERR_PTR(-EUCLEAN);
	}

	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
	btrfs_tree_lock(buf);
	btrfs_clean_tree_block(buf);
	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);

	btrfs_set_lock_blocking_write(buf);
	set_extent_buffer_uptodate(buf);

	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(buf, level);
	btrfs_set_header_bytenr(buf, buf->start);
	btrfs_set_header_generation(buf, trans->transid);
	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(buf, owner);
	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
		buf->log_index = root->log_transid % 2;
		/*
		 * we allow two log transactions at a time, use different
		 * EXTENT bit to differentiate dirty pages.
		 */
		if (buf->log_index == 0)
			set_extent_dirty(&root->dirty_log_pages, buf->start,
					buf->start + buf->len - 1, GFP_NOFS);
		else
			set_extent_new(&root->dirty_log_pages, buf->start,
					buf->start + buf->len - 1);
	} else {
		buf->log_index = -1;
		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
			 buf->start + buf->len - 1, GFP_NOFS);
	}
	trans->dirty = true;
	/* this returns a buffer locked for blocking */
	return buf;
}

/*
 * finds a free extent and does all the dirty work required for allocation
 * returns the tree buffer or an ERR_PTR on error.
 */
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     u64 parent, u64 root_objectid,
					     const struct btrfs_disk_key *key,
					     int level, u64 hint,
					     u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key ins;
	struct btrfs_block_rsv *block_rsv;
	struct extent_buffer *buf;
	struct btrfs_delayed_extent_op *extent_op;
	struct btrfs_ref generic_ref = { 0 };
	u64 flags = 0;
	int ret;
	u32 blocksize = fs_info->nodesize;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
	if (btrfs_is_testing(fs_info)) {
		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
					    level, root_objectid);
		if (!IS_ERR(buf))
			root->alloc_bytenr += blocksize;
		return buf;
	}
#endif

	block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
	if (IS_ERR(block_rsv))
		return ERR_CAST(block_rsv);

	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
				   empty_size, hint, &ins, 0, 0);
	if (ret)
		goto out_unuse;

	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
				    root_objectid);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_free_reserved;
	}

	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent == 0)
			parent = ins.objectid;
		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
	} else
		BUG_ON(parent > 0);

	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
		extent_op = btrfs_alloc_delayed_extent_op();
		if (!extent_op) {
			ret = -ENOMEM;
			goto out_free_buf;
		}
		if (key)
			memcpy(&extent_op->key, key, sizeof(extent_op->key));
		else
			memset(&extent_op->key, 0, sizeof(extent_op->key));
		extent_op->flags_to_set = flags;
		extent_op->update_key = skinny_metadata ? false : true;
		extent_op->update_flags = true;
		extent_op->is_data = false;
		extent_op->level = level;

		btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
				       ins.objectid, ins.offset, parent);
		generic_ref.real_root = root->root_key.objectid;
		btrfs_init_tree_ref(&generic_ref, level, root_objectid);
		btrfs_ref_tree_mod(fs_info, &generic_ref);
		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
						 extent_op, NULL, NULL);
		if (ret)
			goto out_free_delayed;
	}
	return buf;

out_free_delayed:
	btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
	free_extent_buffer(buf);
out_free_reserved:
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
out_unuse:
	btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
	return ERR_PTR(ret);
}
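
/*
 * Illustrative sketch only, abridged from the COW path (an assumption about a
 * typical caller, not a definition of the API): how a new tree block is
 * usually allocated when a node is copied on write.  The buffer comes back
 * locked and already dirtied in the current transaction.
 *
 *	cow = btrfs_alloc_tree_block(trans, root, parent_start,
 *				     root->root_key.objectid, &disk_key,
 *				     level, search_start, empty_size);
 *	if (IS_ERR(cow))
 *		return PTR_ERR(cow);
 */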

struct walk_control {
	u64 refs[BTRFS_MAX_LEVEL];
	u64 flags[BTRFS_MAX_LEVEL];
	struct btrfs_key update_progress;
	struct btrfs_key drop_progress;
	int drop_level;
	int stage;
	int level;
	int shared_level;
	int update_ref;
	int keep_locks;
	int reada_slot;
	int reada_count;
	int restarted;
};

#define DROP_REFERENCE	1
#define UPDATE_BACKREF	2

static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct walk_control *wc,
				     struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 refs;
	u64 flags;
	u32 nritems;
	struct btrfs_key key;
	struct extent_buffer *eb;
	int ret;
	int slot;
	int nread = 0;

	if (path->slots[wc->level] < wc->reada_slot) {
		wc->reada_count = wc->reada_count * 2 / 3;
		wc->reada_count = max(wc->reada_count, 2);
	} else {
		wc->reada_count = wc->reada_count * 3 / 2;
		wc->reada_count = min_t(int, wc->reada_count,
					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
	}

	eb = path->nodes[wc->level];
	nritems = btrfs_header_nritems(eb);

	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
		if (nread >= wc->reada_count)
			break;

		cond_resched();
		bytenr = btrfs_node_blockptr(eb, slot);
		generation = btrfs_node_ptr_generation(eb, slot);

		if (slot == path->slots[wc->level])
			goto reada;

		if (wc->stage == UPDATE_BACKREF &&
		    generation <= root->root_key.offset)
			continue;

		/* We don't lock the tree block, it's OK to be racy here */
		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
					       wc->level - 1, 1, &refs,
					       &flags);
		/* We don't care about errors in readahead. */
		if (ret < 0)
			continue;
		BUG_ON(refs == 0);

		if (wc->stage == DROP_REFERENCE) {
			if (refs == 1)
				goto reada;

			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				continue;
			btrfs_node_key_to_cpu(eb, &key, slot);
			ret = btrfs_comp_cpu_keys(&key,
						  &wc->update_progress);
			if (ret < 0)
				continue;
		} else {
			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
		}
reada:
		readahead_tree_block(fs_info, bytenr);
		nread++;
	}
	wc->reada_slot = slot;
}

/*
 * helper to process tree block while walking down the tree.
 *
 * when wc->stage == UPDATE_BACKREF, this function updates
 * back refs for pointers in the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc, int lookup_info)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	int ret;

	if (wc->stage == UPDATE_BACKREF &&
	    btrfs_header_owner(eb) != root->root_key.objectid)
		return 1;

	/*
	 * when reference count of tree block is 1, it won't increase
	 * again. once full backref flag is set, we never clear it.
	 */
	if (lookup_info &&
	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
		BUG_ON(!path->locks[level]);
		ret = btrfs_lookup_extent_info(trans, fs_info,
					       eb->start, level, 1,
					       &wc->refs[level],
					       &wc->flags[level]);
		BUG_ON(ret == -ENOMEM);
		if (ret)
			return ret;
		BUG_ON(wc->refs[level] == 0);
	}

	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level] > 1)
			return 1;

		if (path->locks[level] && !wc->keep_locks) {
			btrfs_tree_unlock_rw(eb, path->locks[level]);
			path->locks[level] = 0;
		}
		return 0;
	}

	/* wc->stage == UPDATE_BACKREF */
	if (!(wc->flags[level] & flag)) {
		BUG_ON(!path->locks[level]);
		ret = btrfs_inc_ref(trans, root, eb, 1);
		BUG_ON(ret); /* -ENOMEM */
		ret = btrfs_dec_ref(trans, root, eb, 0);
		BUG_ON(ret); /* -ENOMEM */
		ret = btrfs_set_disk_extent_flags(trans, eb->start,
						  eb->len, flag,
						  btrfs_header_level(eb), 0);
		BUG_ON(ret); /* -ENOMEM */
		wc->flags[level] |= flag;
	}

	/*
	 * the block is shared by multiple trees, so it's not good to
	 * keep the tree lock
	 */
	if (path->locks[level] && level > 0) {
		btrfs_tree_unlock_rw(eb, path->locks[level]);
		path->locks[level] = 0;
	}
	return 0;
}

/*
 * This is used to verify a ref exists for this root to deal with a bug where we
 * would have a drop_progress key that hadn't been updated properly.
 */
static int check_ref_exists(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, u64 bytenr, u64 parent,
			    int level)
{
	struct btrfs_path *path;
	struct btrfs_extent_inline_ref *iref;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = lookup_extent_backref(trans, path, &iref, bytenr,
				    root->fs_info->nodesize, parent,
				    root->root_key.objectid, level, 0);
	btrfs_free_path(path);
	if (ret == -ENOENT)
		return 0;
	if (ret < 0)
		return ret;
	return 1;
}

/*
 * helper to process tree block pointer.
 *
 * when wc->stage == DROP_REFERENCE, this function checks
 * reference count of the block pointed to. if the block
 * is shared and we need to update back refs for the subtree
 * rooted at the block, this function changes wc->stage to
 * UPDATE_BACKREF. if the block is shared and there is no
 * need to update back refs, this function drops the reference
 * to the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int *lookup_info)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 parent;
	struct btrfs_key key;
	struct btrfs_key first_key;
	struct btrfs_ref ref = { 0 };
	struct extent_buffer *next;
	int level = wc->level;
	int reada = 0;
	int ret = 0;
	bool need_account = false;

	generation = btrfs_node_ptr_generation(path->nodes[level],
					       path->slots[level]);
	/*
	 * if the lower level block was created before the snapshot
	 * was created, we know there is no need to update back refs
	 * for the subtree
	 */
	if (wc->stage == UPDATE_BACKREF &&
	    generation <= root->root_key.offset) {
		*lookup_info = 1;
		return 1;
	}

	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
	btrfs_node_key_to_cpu(path->nodes[level], &first_key,
			      path->slots[level]);

	next = find_extent_buffer(fs_info, bytenr);
	if (!next) {
		next = btrfs_find_create_tree_block(fs_info, bytenr);
		if (IS_ERR(next))
			return PTR_ERR(next);

		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
					       level - 1);
		reada = 1;
	}
	btrfs_tree_lock(next);
	btrfs_set_lock_blocking_write(next);

	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
				       &wc->refs[level - 1],
				       &wc->flags[level - 1]);
	if (ret < 0)
		goto out_unlock;

	if (unlikely(wc->refs[level - 1] == 0)) {
		btrfs_err(fs_info, "Missing references.");
		ret = -EIO;
		goto out_unlock;
	}
	*lookup_info = 0;

	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level - 1] > 1) {
			need_account = true;
			if (level == 1 &&
			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				goto skip;

			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				goto skip;

			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
			if (ret < 0)
				goto skip;

			wc->stage = UPDATE_BACKREF;
			wc->shared_level = level - 1;
		}
	} else {
		if (level == 1 &&
		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
			goto skip;
	}

	if (!btrfs_buffer_uptodate(next, generation, 0)) {
		btrfs_tree_unlock(next);
		free_extent_buffer(next);
		next = NULL;
		*lookup_info = 1;
	}

	if (!next) {
		if (reada && level == 1)
			reada_walk_down(trans, root, wc, path);
		next = read_tree_block(fs_info, bytenr, generation, level - 1,
				       &first_key);
		if (IS_ERR(next)) {
			return PTR_ERR(next);
		} else if (!extent_buffer_uptodate(next)) {
			free_extent_buffer(next);
			return -EIO;
		}
		btrfs_tree_lock(next);
		btrfs_set_lock_blocking_write(next);
	}

	level--;
	ASSERT(level == btrfs_header_level(next));
	if (level != btrfs_header_level(next)) {
		btrfs_err(root->fs_info, "mismatched level");
		ret = -EIO;
		goto out_unlock;
	}
	path->nodes[level] = next;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	wc->level = level;
	if (wc->level == 1)
		wc->reada_slot = 0;
	return 0;
skip:
	wc->refs[level - 1] = 0;
	wc->flags[level - 1] = 0;
	if (wc->stage == DROP_REFERENCE) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			parent = path->nodes[level]->start;
		} else {
			ASSERT(root->root_key.objectid ==
			       btrfs_header_owner(path->nodes[level]));
			if (root->root_key.objectid !=
			    btrfs_header_owner(path->nodes[level])) {
				btrfs_err(root->fs_info,
						"mismatched block owner");
				ret = -EIO;
				goto out_unlock;
			}
			parent = 0;
		}

		/*
		 * If we had a drop_progress we need to verify the refs are set
		 * as expected.  If we find our ref then we know that from here
		 * on out everything should be correct, and we can clear the
		 * ->restarted flag.
		 */
		if (wc->restarted) {
			ret = check_ref_exists(trans, root, bytenr, parent,
					       level - 1);
			if (ret < 0)
				goto out_unlock;
			if (ret == 0)
				goto no_delete;
			ret = 0;
			wc->restarted = 0;
		}

		/*
		 * Reloc tree doesn't contribute to qgroup numbers, and we have
		 * already accounted them at merge time (replace_path),
		 * thus we could skip expensive subtree trace here.
		 */
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
		    need_account) {
			ret = btrfs_qgroup_trace_subtree(trans, next,
							 generation, level - 1);
			if (ret) {
				btrfs_err_rl(fs_info,
					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
					     ret);
			}
		}

		/*
		 * We need to update the next key in our walk control so we can
		 * update the drop_progress key accordingly.  We don't care if
		 * find_next_key doesn't find a key because that means we're at
		 * the end and are going to clean up now.
		 */
		wc->drop_level = level;
		find_next_key(path, level, &wc->drop_progress);

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
				       fs_info->nodesize, parent);
		btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
		ret = btrfs_free_extent(trans, &ref);
		if (ret)
			goto out_unlock;
	}
no_delete:
	*lookup_info = 1;
	ret = 1;

out_unlock:
	btrfs_tree_unlock(next);
	free_extent_buffer(next);

	return ret;
}

/*
 * helper to process tree block while walking up the tree.
 *
 * when wc->stage == DROP_REFERENCE, this function drops
 * reference count on the block.
 *
 * when wc->stage == UPDATE_BACKREF, this function changes
 * wc->stage back to DROP_REFERENCE if we changed wc->stage
 * to UPDATE_BACKREF previously while processing the block.
 *
 * NOTE: return value 1 means we should stop walking up.
 */
static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 parent = 0;

	if (wc->stage == UPDATE_BACKREF) {
		BUG_ON(wc->shared_level < level);
		if (level < wc->shared_level)
			goto out;

		ret = find_next_key(path, level + 1, &wc->update_progress);
		if (ret > 0)
			wc->update_ref = 0;

		wc->stage = DROP_REFERENCE;
		wc->shared_level = -1;
		path->slots[level] = 0;

		/*
		 * check reference count again if the block isn't locked.
		 * we should start walking down the tree again if reference
		 * count is one.
		 */
		if (!path->locks[level]) {
			BUG_ON(level == 0);
			btrfs_tree_lock(eb);
			btrfs_set_lock_blocking_write(eb);
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						       eb->start, level, 1,
						       &wc->refs[level],
						       &wc->flags[level]);
			if (ret < 0) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return ret;
			}
			BUG_ON(wc->refs[level] == 0);
			if (wc->refs[level] == 1) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
				return 1;
			}
		}
	}

	/* wc->stage == DROP_REFERENCE */
	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);

	if (wc->refs[level] == 1) {
		if (level == 0) {
			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
				ret = btrfs_dec_ref(trans, root, eb, 1);
			else
				ret = btrfs_dec_ref(trans, root, eb, 0);
			BUG_ON(ret); /* -ENOMEM */
			if (is_fstree(root->root_key.objectid)) {
				ret = btrfs_qgroup_trace_leaf_items(trans, eb);
				if (ret) {
					btrfs_err_rl(fs_info,
	"error %d accounting leaf items, quota is out of sync, rescan required",
					     ret);
				}
			}
		}
		/* make block locked assertion in btrfs_clean_tree_block happy */
		if (!path->locks[level] &&
		    btrfs_header_generation(eb) == trans->transid) {
			btrfs_tree_lock(eb);
			btrfs_set_lock_blocking_write(eb);
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
		}
		btrfs_clean_tree_block(eb);
	}

	if (eb == root->node) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = eb->start;
		else if (root->root_key.objectid != btrfs_header_owner(eb))
			goto owner_mismatch;
	} else {
		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = path->nodes[level + 1]->start;
		else if (root->root_key.objectid !=
			 btrfs_header_owner(path->nodes[level + 1]))
			goto owner_mismatch;
	}

	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
out:
	wc->refs[level] = 0;
	wc->flags[level] = 0;
	return 0;

owner_mismatch:
	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
		     btrfs_header_owner(eb), root->root_key.objectid);
	return -EUCLEAN;
}

static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc)
{
	int level = wc->level;
	int lookup_info = 1;
	int ret;

	while (level >= 0) {
		ret = walk_down_proc(trans, root, path, wc, lookup_info);
		if (ret > 0)
			break;

		if (level == 0)
			break;

		if (path->slots[level] >=
		    btrfs_header_nritems(path->nodes[level]))
			break;

		ret = do_walk_down(trans, root, path, wc, &lookup_info);
		if (ret > 0) {
			path->slots[level]++;
			continue;
		} else if (ret < 0)
			return ret;
		level = wc->level;
	}
	return 0;
}

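/*
 * Walk back up the tree from wc->level toward max_level.  If the current node
 * still has unvisited slots we move to the next slot and return 0 so the
 * caller resumes walking down; otherwise walk_up_proc() finishes off the block
 * and we pop one level.  Returns 1 once every level up to max_level has been
 * processed.
 */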
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int max_level)
{
	int level = wc->level;
	int ret;

	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
	while (level < max_level && path->nodes[level]) {
		wc->level = level;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			path->slots[level]++;
			return 0;
		} else {
			ret = walk_up_proc(trans, root, path, wc);
			if (ret > 0)
				return 0;
			if (ret < 0)
				return ret;

			if (path->locks[level]) {
				btrfs_tree_unlock_rw(path->nodes[level],
						     path->locks[level]);
				path->locks[level] = 0;
			}
			free_extent_buffer(path->nodes[level]);
			path->nodes[level] = NULL;
			level++;
		}
	}
	return 1;
}

/*
 * drop a subvolume tree.
 *
 * this function traverses the tree freeing any blocks that are only
 * referenced by the tree.
 *
 * when a shared tree block is found, this function decreases its
 * reference count by one. if update_ref is true, this function
 * also makes sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 *
 * If called with for_reloc == 0, may exit early with -EAGAIN
 */
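/*
 * Illustrative call pattern (an assumption, not taken verbatim from a caller):
 * the cleaner thread typically drops one dead root at a time, e.g.
 *
 *	if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV)
 *		ret = btrfs_drop_snapshot(root, NULL, 0, 0);
 *	else
 *		ret = btrfs_drop_snapshot(root, NULL, 1, 0);
 *
 * An early exit with -EAGAIN leaves the root on the dead roots list (see the
 * out: label below), so the caller can simply retry later.
 */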
int btrfs_drop_snapshot(struct btrfs_root *root,
			 struct btrfs_block_rsv *block_rsv, int update_ref,
			 int for_reloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root_item *root_item = &root->root_item;
	struct walk_control *wc;
	struct btrfs_key key;
	int err = 0;
	int ret;
	int level;
	bool root_dropped = false;

	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		err = -ENOMEM;
		goto out;
	}

	trans = btrfs_start_transaction(tree_root, 0);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}

	err = btrfs_run_delayed_items(trans);
	if (err)
		goto out_end_trans;

	if (block_rsv)
		trans->block_rsv = block_rsv;

	/*
	 * This will help us catch people modifying the fs tree while we're
	 * dropping it.  It is unsafe to mess with the fs tree while it's being
	 * dropped as we unlock the root node and parent nodes as we walk down
	 * the tree, assuming nothing will change.  If something does change
	 * then we'll have stale information and drop references to blocks we've
	 * already dropped.
	 */
	set_bit(BTRFS_ROOT_DELETING, &root->state);
	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_header_level(root->node);
		path->nodes[level] = btrfs_lock_root_node(root);
		btrfs_set_lock_blocking_write(path->nodes[level]);
		path->slots[level] = 0;
		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
		memset(&wc->update_progress, 0,
		       sizeof(wc->update_progress));
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
		memcpy(&wc->update_progress, &key,
		       sizeof(wc->update_progress));

		level = root_item->drop_level;
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			err = ret;
			goto out_end_trans;
		}
		WARN_ON(ret > 0);

		/*
		 * unlock our path, this is safe because only this
		 * function is allowed to delete this snapshot
		 */
		btrfs_unlock_up_safe(path, 0);

		level = btrfs_header_level(root->node);
		while (1) {
			btrfs_tree_lock(path->nodes[level]);
			btrfs_set_lock_blocking_write(path->nodes[level]);
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						path->nodes[level]->start,
						level, 1, &wc->refs[level],
						&wc->flags[level]);
			if (ret < 0) {
				err = ret;
				goto out_end_trans;
			}
			BUG_ON(wc->refs[level] == 0);

			if (level == root_item->drop_level)
				break;

			btrfs_tree_unlock(path->nodes[level]);
			path->locks[level] = 0;
			WARN_ON(wc->refs[level] != 1);
			level--;
		}
	}

	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = update_ref;
	wc->keep_locks = 0;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {

		ret = walk_down_tree(trans, root, path, wc);
		if (ret < 0) {
			err = ret;
			break;
		}

		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
		if (ret < 0) {
			err = ret;
			break;
		}

		if (ret > 0) {
			BUG_ON(wc->stage != DROP_REFERENCE);
			break;
		}

		if (wc->stage == DROP_REFERENCE) {
			wc->drop_level = wc->level;
			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
					      &wc->drop_progress,
					      path->slots[wc->drop_level]);
		}
		btrfs_cpu_key_to_disk(&root_item->drop_progress,
				      &wc->drop_progress);
		root_item->drop_level = wc->drop_level;

		BUG_ON(wc->level == 0);
		if (btrfs_should_end_transaction(trans) ||
		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
			ret = btrfs_update_root(trans, tree_root,
						&root->root_key,
						root_item);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				err = ret;
				goto out_end_trans;
			}

			btrfs_end_transaction_throttle(trans);
			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
				btrfs_debug(fs_info,
					    "drop snapshot early exit");
				err = -EAGAIN;
				goto out_free;
			}

			trans = btrfs_start_transaction(tree_root, 0);
			if (IS_ERR(trans)) {
				err = PTR_ERR(trans);
				goto out_free;
			}
			if (block_rsv)
				trans->block_rsv = block_rsv;
		}
	}
	btrfs_release_path(path);
	if (err)
		goto out_end_trans;

	ret = btrfs_del_root(trans, &root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		err = ret;
		goto out_end_trans;
	}

	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_find_root(tree_root, &root->root_key, path,
				      NULL, NULL);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			err = ret;
			goto out_end_trans;
		} else if (ret > 0) {
			/* if we fail to delete the orphan item this time
			 * around, it'll get picked up the next time.
			 *
			 * The most common failure here is just -ENOENT.
			 */
			btrfs_del_orphan_item(trans, tree_root,
					      root->root_key.objectid);
		}
	}

	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
		btrfs_add_dropped_root(trans, root);
	} else {
		free_extent_buffer(root->node);
		free_extent_buffer(root->commit_root);
		btrfs_put_fs_root(root);
	}
	root_dropped = true;
out_end_trans:
	btrfs_end_transaction_throttle(trans);
out_free:
	kfree(wc);
	btrfs_free_path(path);
out:
	/*
	 * So if we need to stop dropping the snapshot for whatever reason we
	 * need to make sure to add it back to the dead root list so that we
	 * keep trying to do the work later.  This also cleans up roots if we
	 * don't have them in the radix (like when we recover after a power
	 * fail or unmount) so we don't leak memory.
	 */
	if (!for_reloc && !root_dropped)
		btrfs_add_dead_root(root);
	if (err && err != -EAGAIN)
		btrfs_handle_fs_error(fs_info, err, NULL);
	return err;
}

/*
 * drop subtree rooted at tree block 'node'.
 *
 * NOTE: this function will unlock and release tree block 'node'
 * only used by relocation code
 */
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct extent_buffer *node,
			struct extent_buffer *parent)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct walk_control *wc;
	int level;
	int parent_level;
	int ret = 0;
	int wret;

	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

	btrfs_assert_tree_locked(parent);
	parent_level = btrfs_header_level(parent);
	atomic_inc(&parent->refs);
	path->nodes[parent_level] = parent;
	path->slots[parent_level] = btrfs_header_nritems(parent);

	btrfs_assert_tree_locked(node);
	level = btrfs_header_level(node);
	path->nodes[level] = node;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;

	wc->refs[parent_level] = 1;
	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = 0;
	wc->keep_locks = 1;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {
		wret = walk_down_tree(trans, root, path, wc);
		if (wret < 0) {
			ret = wret;
			break;
		}

		wret = walk_up_tree(trans, root, path, wc, parent_level);
		if (wret < 0)
			ret = wret;
		if (wret != 0)
			break;
	}

	kfree(wc);
	btrfs_free_path(path);
	return ret;
}

/*
 * Helper to account the unused space of all read-only block groups in the
 * space_info. Takes mirrors into account.
 */
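/*
 * Illustrative example (assumes btrfs_bg_type_to_factor() returns 2 for
 * RAID1): a read-only RAID1 block group of 1GiB with 256MiB used contributes
 * (1GiB - 256MiB) * 2 = 1.5GiB of raw free bytes to the total below.
 */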
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
	struct btrfs_block_group_cache *block_group;
	u64 free_bytes = 0;
	int factor;

	/* It's df, we don't care if it's racy */
	if (list_empty(&sinfo->ro_bgs))
		return 0;

	spin_lock(&sinfo->lock);
	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
		spin_lock(&block_group->lock);

		if (!block_group->ro) {
			spin_unlock(&block_group->lock);
			continue;
		}

		factor = btrfs_bg_type_to_factor(block_group->flags);
		free_bytes += (block_group->key.offset -
			       btrfs_block_group_used(&block_group->item)) *
			       factor;

		spin_unlock(&block_group->lock);
	}
	spin_unlock(&sinfo->lock);

	return free_bytes;
}

int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
				   u64 start, u64 end)
{
	return unpin_extent_range(fs_info, start, end, false);
}

/*
 * It used to be that old block groups would be left around forever.
 * Iterating over them would be enough to trim unused space.  Since we
 * now automatically remove them, we also need to iterate over unallocated
 * space.
 *
 * We don't want a transaction for this since the discard may take a
 * substantial amount of time.  We don't require that a transaction be
 * running, but we do need to take a running transaction into account
 * to ensure that we're not discarding chunks that were released or
 * allocated in the current transaction.
 *
 * Holding the chunks lock will prevent other threads from allocating
 * or releasing chunks, but it won't prevent a running transaction
 * from committing and releasing the memory that the pending chunks
 * list head uses.  For that, we need to take a reference to the
 * transaction and hold the commit root sem.  We only need to hold
 * it while performing the free space search since we have already
 * held back allocations.
 */
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
	u64 start = SZ_1M, len = 0, end = 0;
	int ret;

	*trimmed = 0;

	/* Discard not supported = nothing to do. */
	if (!blk_queue_discard(bdev_get_queue(device->bdev)))
		return 0;

	/* Not writable = nothing to do. */
	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return 0;

	/* No free space = nothing to do. */
	if (device->total_bytes <= device->bytes_used)
		return 0;

	ret = 0;

	while (1) {
		struct btrfs_fs_info *fs_info = device->fs_info;
		u64 bytes;

		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
		if (ret)
			break;

		find_first_clear_extent_bit(&device->alloc_state, start,
					    &start, &end,
					    CHUNK_TRIMMED | CHUNK_ALLOCATED);

		/* Ensure we skip the reserved area in the first 1M */
		start = max_t(u64, start, SZ_1M);

		/*
		 * If find_first_clear_extent_bit finds a range that spans the
		 * end of the device it will set end to -1, in which case it's
		 * up to the caller to trim the value to the size of the
		 * device.
		 */
		end = min(end, device->total_bytes - 1);

		len = end - start + 1;

		/* We didn't find any extents */
		if (!len) {
			mutex_unlock(&fs_info->chunk_mutex);
			ret = 0;
			break;
		}

		ret = btrfs_issue_discard(device->bdev, start, len,
					  &bytes);
		if (!ret)
			set_extent_bits(&device->alloc_state, start,
					start + bytes - 1,
					CHUNK_TRIMMED);
		mutex_unlock(&fs_info->chunk_mutex);

		if (ret)
			break;

		start += len;
		*trimmed += bytes;

		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	return ret;
}

/*
 * Trim the whole filesystem by:
 * 1) trimming the free space in each block group
 * 2) trimming the unallocated space on each device
 *
 * This will also continue trimming even if a block group or device encounters
 * an error.  The return value will be the last error, or 0 if nothing bad
 * happens.
 */
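/*
 * Illustrative example: for an fstrim request with range->start = 1GiB and
 * range->len = 2GiB, range_end becomes 3GiB and a block group spanning
 * [2.5GiB, 3.5GiB) is trimmed only over [2.5GiB, 3GiB), provided that span is
 * at least range->minlen.  Note that step 2 calls btrfs_trim_free_extents()
 * without a range, so unallocated device space is trimmed regardless of
 * range->start and range->len.
 */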
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
	struct btrfs_block_group_cache *cache = NULL;
	struct btrfs_device *device;
	struct list_head *devices;
	u64 group_trimmed;
	u64 range_end = U64_MAX;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 bg_failed = 0;
	u64 dev_failed = 0;
	int bg_ret = 0;
	int dev_ret = 0;
	int ret = 0;

	/*
	 * Check range overflow if range->len is set.
	 * The default range->len is U64_MAX.
	 */
	if (range->len != U64_MAX &&
	    check_add_overflow(range->start, range->len, &range_end))
		return -EINVAL;

	cache = btrfs_lookup_first_block_group(fs_info, range->start);
	for (; cache; cache = btrfs_next_block_group(cache)) {
		if (cache->key.objectid >= range_end) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->key.objectid);
		end = min(range_end, cache->key.objectid + cache->key.offset);

		if (end - start >= range->minlen) {
			if (!btrfs_block_group_cache_done(cache)) {
				ret = btrfs_cache_block_group(cache, 0);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
				ret = btrfs_wait_block_group_cache_done(cache);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				bg_failed++;
				bg_ret = ret;
				continue;
			}
		}
	}

	if (bg_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu block group(s), last error %d",
			bg_failed, bg_ret);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	devices = &fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		ret = btrfs_trim_free_extents(device, &group_trimmed);
		if (ret) {
			dev_failed++;
			dev_ret = ret;
			break;
		}

		trimmed += group_trimmed;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	if (dev_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu device(s), last error %d",
			dev_failed, dev_ret);
	range->len = trimmed;
	if (bg_ret)
		return bg_ret;
	return dev_ret;
}

/*
 * btrfs_{start,end}_write_no_snapshotting() are similar to
 * mnt_{want,drop}_write(). They are used to prevent some tasks from writing
 * data into the page cache through nocow before the subvolume is snapshotted,
 * but to flush the data to disk after the snapshot creation, or to prevent
 * operations while snapshotting is ongoing that would cause the snapshot to
 * be inconsistent (writes followed by expanding truncates for example).
 */
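/*
 * Typical usage (sketch, not copied verbatim from any caller):
 *
 *	if (!btrfs_start_write_no_snapshotting(root))
 *		return -EAGAIN;		(or fall back to a COW write)
 *	... perform the nocow write ...
 *	btrfs_end_write_no_snapshotting(root);
 *
 * Callers that must wait for a pending snapshot instead use
 * btrfs_wait_for_snapshot_creation() below.
 */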
void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
{
	percpu_counter_dec(&root->subv_writers->counter);
	cond_wake_up(&root->subv_writers->wait);
}

int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
{
	if (atomic_read(&root->will_be_snapshotted))
		return 0;

	percpu_counter_inc(&root->subv_writers->counter);
	/*
	 * Make sure counter is updated before we check for snapshot creation.
	 */
	smp_mb();
	if (atomic_read(&root->will_be_snapshotted)) {
		btrfs_end_write_no_snapshotting(root);
		return 0;
	}
	return 1;
}

void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
{
	while (true) {
		int ret;

		ret = btrfs_start_write_no_snapshotting(root);
		if (ret)
			break;
		wait_var_event(&root->will_be_snapshotted,
			       !atomic_read(&root->will_be_snapshotted));
	}
}