extent-tree.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include <linux/lockdep.h>
#include <linux/crc32c.h>
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"
#include "ref-verify.h"
#include "space-info.h"
#include "block-rsv.h"
#include "delalloc-space.h"
#include "block-group.h"

#undef SCRAMBLE_DELAYED_REFS


static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
			      u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE);
	set_extent_bits(&fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE);
	return 0;
}

void btrfs_free_excluded_extents(struct btrfs_block_group_cache *cache)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 start, end;

	start = cache->key.objectid;
	end = start + cache->key.offset - 1;

	clear_extent_bits(&fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE);
	clear_extent_bits(&fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE);
}

static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
{
	if (ref->type == BTRFS_REF_METADATA) {
		if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
			return BTRFS_BLOCK_GROUP_SYSTEM;
		else
			return BTRFS_BLOCK_GROUP_METADATA;
	}
	return BTRFS_BLOCK_GROUP_DATA;
}

static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
			     struct btrfs_ref *ref)
{
	struct btrfs_space_info *space_info;
	u64 flags = generic_ref_to_space_flags(ref);

	space_info = btrfs_find_space_info(fs_info, flags);
	ASSERT(space_info);
	percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
		    BTRFS_TOTAL_BYTES_PINNED_BATCH);
}

static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
			     struct btrfs_ref *ref)
{
	struct btrfs_space_info *space_info;
	u64 flags = generic_ref_to_space_flags(ref);

	space_info = btrfs_find_space_info(fs_info, flags);
	ASSERT(space_info);
	percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
		    BTRFS_TOTAL_BYTES_PINNED_BATCH);
}

/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * to see what the reference count and extent flags would be if all of
 * the delayed refs are not processed.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
		offset = fs_info->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
			ret = -EINVAL;
			btrfs_print_v0_err(fs_info);
			if (trans)
				btrfs_abort_transaction(trans, ret);
			else
				btrfs_handle_fs_error(fs_info, ret, NULL);

			goto out_free;
		}

		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			refcount_inc(&head->refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and try
			 * again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref_head(head);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}
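
/*
 * Example usage (a sketch, not lifted from a specific caller): to read the
 * current reference count and flags of a tree block at @bytenr whose level
 * is known, a caller might do
 *
 *	u64 refs, flags;
 *	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level, 1,
 *				       &refs, &flags);
 *
 * Passing a NULL @trans makes the lookup search the commit root and skip
 * the delayed ref head adjustments done above.
 */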

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs is optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. The full back refs is for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back refs. Actually the full back refs is generic, and
 * can be used in all cases the implicit back refs is used. The major
 * shortcoming of the full back refs is its overhead. Every time a tree
 * block gets COWed, we have to update the back refs entry for all pointers
 * in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs is used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointer in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs is used for
 * pointers in the block. Add full back refs for every pointer in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are carried over to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * the key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is the hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has a field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the full back refs is the first byte of
 * the tree leaf.
 *
 * When a file extent is allocated, the implicit back refs is used and
 * the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of a key. The key offset for the implicit back refs is
 * the objectid of the block's owner tree. The key offset for the full
 * back refs is the first byte of the parent block.
 *
 * When implicit back refs is used, information about the lowest key and
 * level of the tree block are required. This information is stored in
 * the tree block info structure.
 */
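
/*
 * For example (values are illustrative only): a data extent at @bytenr
 * referenced by inode 257 at file offset 0 in the subvolume tree whose id
 * is 5 gets an implicit back ref keyed as
 *
 *     (bytenr, BTRFS_EXTENT_DATA_REF_KEY, hash_extent_data_ref(5, 257, 0))
 *
 * while a shared reference held by a parent tree leaf at @parent_bytenr is
 * keyed as
 *
 *     (bytenr, BTRFS_SHARED_DATA_REF_KEY, parent_bytenr)
 *
 * This matches how lookup_extent_data_ref() and insert_extent_data_ref()
 * below build their search keys.
 */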

/*
 * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
 * is_data == BTRFS_REF_TYPE_DATA, data type is required,
 * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
 */
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
				     struct btrfs_extent_inline_ref *iref,
				     enum btrfs_inline_ref_type is_data)
{
	int type = btrfs_extent_inline_ref_type(eb, iref);
	u64 offset = btrfs_extent_inline_ref_offset(eb, iref);

	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_BLOCK_REF_KEY ||
	    type == BTRFS_SHARED_DATA_REF_KEY ||
	    type == BTRFS_EXTENT_DATA_REF_KEY) {
		if (is_data == BTRFS_REF_TYPE_BLOCK) {
			if (type == BTRFS_TREE_BLOCK_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree
				 * block, which must be aligned to
				 * nodesize.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->nodesize))
					return type;
			}
		} else if (is_data == BTRFS_REF_TYPE_DATA) {
			if (type == BTRFS_EXTENT_DATA_REF_KEY)
				return type;
			if (type == BTRFS_SHARED_DATA_REF_KEY) {
				ASSERT(eb->fs_info);
				/*
				 * Every shared one has parent tree
				 * block, which must be aligned to
				 * nodesize.
				 */
				if (offset &&
				    IS_ALIGNED(offset, eb->fs_info->nodesize))
					return type;
			}
		} else {
			ASSERT(is_data == BTRFS_REF_TYPE_ANY);
			return type;
		}
	}

	btrfs_print_leaf((struct extent_buffer *)eb);
	btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
		  eb->start, type);
	WARN_ON(1);

	return BTRFS_REF_TYPE_INVALID;
}
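
/*
 * A typical caller walking the inline refs of a data extent item uses this
 * roughly as follows (sketch; lookup_inline_extent_backref() below does the
 * same):
 *
 *	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
 *	if (type == BTRFS_REF_TYPE_INVALID)
 *		return -EUCLEAN;
 *
 * so a corrupted ref type is reported instead of being interpreted.
 */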

static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_path *path,
					   int refs_to_drop, int *last_ref)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
		btrfs_print_v0_err(trans->fs_info);
		btrfs_abort_transaction(trans, -EINVAL);
		return -EINVAL;
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
		*last_ref = 1;
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
		btrfs_mark_buffer_dirty(leaf);
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;
	int type;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
	if (iref) {
		/*
		 * If type is invalid, we should have bailed out earlier than
		 * this call.
		 */
		type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
		ASSERT(type != BTRFS_REF_TYPE_INVALID);
		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_root *root = trans->fs_info->extent_root;
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
	return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
				      path, &key, 0);
	btrfs_release_path(path);
	return ret;
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)

{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
	int needed;

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->keep_locks = 1;
	} else
		extra_size = -1;

	/*
	 * Owner is our level, so we can just add one to get the level for the
	 * block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		skinny_metadata = false;
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

	if (ret && !insert) {
		err = -ENOENT;
		goto out;
	} else if (WARN_ON(ret)) {
		err = -EIO;
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (unlikely(item_size < sizeof(*ei))) {
		err = -EINVAL;
		btrfs_print_v0_err(fs_info);
		btrfs_abort_transaction(trans, err);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

	if (owner >= BTRFS_FIRST_FREE_OBJECTID)
		needed = BTRFS_REF_TYPE_DATA;
	else
		needed = BTRFS_REF_TYPE_BLOCK;

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
		if (type == BTRFS_REF_TYPE_INVALID) {
			err = -EUCLEAN;
			goto out;
		}

		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add new inline back
		 * ref if there is any kind of item for this block
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}
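
/*
 * Sketch of how the return values above are consumed by the callers below
 * (insert_inline_extent_backref() and __btrfs_inc_extent_ref()):
 *
 *	ret = lookup_inline_extent_backref(trans, path, &iref, ...);
 *	if (ret == 0)			// found: update the inline ref in place
 *		update_inline_extent_backref(...);
 *	else if (ret == -ENOENT)	// not found: add a new inline ref here
 *		setup_inline_extent_backref(...);
 *	else if (ret == -EAGAIN)	// no room inline: add a separate backref item
 *		insert_extent_backref(...);
 */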

/*
 * helper to add new inline back ref
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	btrfs_extend_item(path, size);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset);
	}
	return ret;
}

/*
 * helper to update/remove inline back ref
 */
static noinline_for_stack
void update_inline_extent_backref(struct btrfs_path *path,
				  struct btrfs_extent_inline_ref *iref,
				  int refs_to_mod,
				  struct btrfs_delayed_extent_op *extent_op,
				  int *last_ref)
{
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	u64 refs;

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	/*
	 * If type is invalid, we should have bailed out after
	 * lookup_inline_extent_backref().
	 */
	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
	ASSERT(type != BTRFS_REF_TYPE_INVALID);

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
	}

	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		*last_ref = 1;
		size =  btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		btrfs_truncate_item(path, item_size, 1);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
					   num_bytes, parent, root_objectid,
					   owner, offset, 1);
	if (ret == 0) {
		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
		update_inline_extent_backref(path, iref, refs_to_add,
					     extent_op, NULL);
	} else if (ret == -ENOENT) {
		setup_inline_extent_backref(trans->fs_info, path, iref, parent,
					    root_objectid, owner, offset,
					    refs_to_add, extent_op);
		ret = 0;
	}
	return ret;
}

static int insert_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 u64 bytenr, u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add)
{
	int ret;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		BUG_ON(refs_to_add != 1);
		ret = insert_tree_block_ref(trans, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = insert_extent_data_ref(trans, path, bytenr, parent,
					     root_objectid, owner, offset,
					     refs_to_add);
	}
	return ret;
}

static int remove_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 int refs_to_drop, int is_data, int *last_ref)
{
	int ret = 0;

	BUG_ON(!is_data && refs_to_drop != 1);
	if (iref) {
		update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
					     last_ref);
	} else if (is_data) {
		ret = remove_extent_data_ref(trans, path, refs_to_drop,
					     last_ref);
	} else {
		*last_ref = 1;
		ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
	}
	return ret;
}

static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
			       u64 *discarded_bytes)
{
	int j, ret = 0;
	u64 bytes_left, end;
	u64 aligned_start = ALIGN(start, 1 << 9);

	if (WARN_ON(start != aligned_start)) {
		len -= aligned_start - start;
		len = round_down(len, 1 << 9);
		start = aligned_start;
	}

	*discarded_bytes = 0;

	if (!len)
		return 0;

	end = start + len;
	bytes_left = len;

	/* Skip any superblocks on this device. */
	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
		u64 sb_start = btrfs_sb_offset(j);
		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
		u64 size = sb_start - start;

		if (!in_range(sb_start, start, bytes_left) &&
		    !in_range(sb_end, start, bytes_left) &&
		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
			continue;

		/*
		 * Superblock spans beginning of range.  Adjust start and
		 * try again.
		 */
		if (sb_start <= start) {
			start += sb_end - start;
			if (start > end) {
				bytes_left = 0;
				break;
			}
			bytes_left = end - start;
			continue;
		}

		if (size) {
			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
						   GFP_NOFS, 0);
			if (!ret)
				*discarded_bytes += size;
			else if (ret != -EOPNOTSUPP)
				return ret;
		}

		start = sb_end;
		if (start > end) {
			bytes_left = 0;
			break;
		}
		bytes_left = end - start;
	}

	if (bytes_left) {
		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
					   GFP_NOFS, 0);
		if (!ret)
			*discarded_bytes += bytes_left;
	}
	return ret;
}

int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
			 u64 num_bytes, u64 *actual_bytes)
{
	int ret;
	u64 discarded_bytes = 0;
	struct btrfs_bio *bbio = NULL;


	/*
	 * Avoid races with device replace and make sure our bbio has devices
	 * associated to its stripes that don't go away while we are discarding.
	 */
	btrfs_bio_counter_inc_blocked(fs_info);
	/* Tell the block device(s) that the sectors can be discarded */
	ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
			      &bbio, 0);
	/* Error condition is -ENOMEM */
	if (!ret) {
		struct btrfs_bio_stripe *stripe = bbio->stripes;
		int i;


		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
			u64 bytes;
			struct request_queue *req_q;

			if (!stripe->dev->bdev) {
				ASSERT(btrfs_test_opt(fs_info, DEGRADED));
				continue;
			}
			req_q = bdev_get_queue(stripe->dev->bdev);
			if (!blk_queue_discard(req_q))
				continue;

			ret = btrfs_issue_discard(stripe->dev->bdev,
						  stripe->physical,
						  stripe->length,
						  &bytes);
			if (!ret)
				discarded_bytes += bytes;
			else if (ret != -EOPNOTSUPP)
				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */

			/*
			 * Just in case we get back EOPNOTSUPP for some reason,
			 * just ignore the return value so we don't screw up
			 * people calling discard_extent.
			 */
			ret = 0;
		}
		btrfs_put_bbio(bbio);
	}
	btrfs_bio_counter_dec(fs_info);

	if (actual_bytes)
		*actual_bytes = discarded_bytes;


	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret;
}

/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_ref *generic_ref)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int old_ref_mod, new_ref_mod;
	int ret;

	ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
	       generic_ref->action);
	BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
	       generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);

	if (generic_ref->type == BTRFS_REF_METADATA)
		ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
				NULL, &old_ref_mod, &new_ref_mod);
	else
		ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
						 &old_ref_mod, &new_ref_mod);

	btrfs_ref_tree_mod(fs_info, generic_ref);

	if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
		sub_pinned_bytes(fs_info, generic_ref);

	return ret;
}

/*
 * __btrfs_inc_extent_ref - insert backreference for a given extent
 *
 * @trans:	    Handle of transaction
 *
 * @node:	    The delayed ref node used to get the bytenr/length for
 *		    extent whose references are incremented.
 *
 * @parent:	    If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
 *		    BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
 *		    bytenr of the parent block. Since new extents are always
 *		    created with indirect references, this will only be the case
 *		    when relocating a shared extent. In that case, root_objectid
 *		    will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
 *		    be 0
 *
 * @root_objectid:  The id of the root where this modification has originated,
 *		    this can be either one of the well-known metadata trees or
 *		    the subvolume id which references this extent.
 *
 * @owner:	    For data extents it is the inode number of the owning file.
 *		    For metadata extents this parameter holds the level in the
 *		    tree of the extent.
 *
 * @offset:	    For metadata extents the offset is ignored and is currently
 *		    always passed as 0. For data extents it is the fileoffset
 *		    this extent belongs to.
 *
 * @refs_to_add     Number of references to add
 *
 * @extent_op       Pointer to a structure, holding information necessary when
 *                  updating a tree block's flags
 *
 */
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_delayed_ref_node *node,
				  u64 parent, u64 root_objectid,
				  u64 owner, u64 offset, int refs_to_add,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
	struct btrfs_key key;
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	u64 refs;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	path->leave_spinning = 1;
	/* this will setup the path even if it fails to insert the back ref */
	ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
					   parent, root_objectid, owner,
					   offset, refs_to_add, extent_op);
	if ((ret < 0 && ret != -EAGAIN) || !ret)
		goto out;

	/*
	 * Ok we had -EAGAIN which means we didn't have space to insert an
	 * inline extent ref, so just update the reference count and add a
	 * normal backref.
	 */
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, item);
	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, item);

	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	path->reada = READA_FORWARD;
	path->leave_spinning = 1;
	/* now insert the actual backref */
	ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
				    owner, offset, refs_to_add);
	if (ret)
		btrfs_abort_transaction(trans, ret);
out:
	btrfs_free_path(path);
	return ret;
}

static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_key ins;
	u64 parent = 0;
	u64 ref_root = 0;
	u64 flags = 0;

	ins.objectid = node->bytenr;
	ins.offset = node->num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;

	ref = btrfs_delayed_node_to_data_ref(node);
	trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);

	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
		parent = ref->parent;
	ref_root = ref->root;

	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		if (extent_op)
			flags |= extent_op->flags_to_set;
		ret = alloc_reserved_file_extent(trans, parent, ref_root,
						 flags, ref->objectid,
						 ref->offset, &ins,
						 node->ref_mod);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
					     ref->objectid, ref->offset,
					     node->ref_mod, extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, node, parent,
					  ref_root, ref->objectid,
					  ref->offset, node->ref_mod,
					  extent_op);
	} else {
		BUG();
	}
	return ret;
}

static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei)
{
	u64 flags = btrfs_extent_flags(leaf, ei);
	if (extent_op->update_flags) {
		flags |= extent_op->flags_to_set;
		btrfs_set_extent_flags(leaf, ei, flags);
	}

	if (extent_op->update_key) {
		struct btrfs_tree_block_info *bi;
		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
	}
}

static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
				 struct btrfs_delayed_ref_head *head,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	u32 item_size;
	int ret;
	int err = 0;
	int metadata = !extent_op->is_data;

	if (trans->aborted)
		return 0;

	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		metadata = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = head->bytenr;

	if (metadata) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = extent_op->level;
	} else {
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = head->num_bytes;
	}

again:
	path->reada = READA_FORWARD;
	path->leave_spinning = 1;
	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}
	if (ret > 0) {
		if (metadata) {
			if (path->slots[0] > 0) {
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == head->bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == head->num_bytes)
					ret = 0;
			}
			if (ret > 0) {
				btrfs_release_path(path);
				metadata = 0;

				key.objectid = head->bytenr;
				key.offset = head->num_bytes;
				key.type = BTRFS_EXTENT_ITEM_KEY;
				goto again;
			}
		} else {
			err = -EIO;
			goto out;
		}
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);

	if (unlikely(item_size < sizeof(*ei))) {
		err = -EINVAL;
		btrfs_print_v0_err(fs_info);
		btrfs_abort_transaction(trans, err);
		goto out;
	}

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	__run_delayed_extent_op(extent_op, leaf, ei);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return err;
}

static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_tree_ref *ref;
	u64 parent = 0;
	u64 ref_root = 0;

	ref = btrfs_delayed_node_to_tree_ref(node);
	trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		parent = ref->parent;
	ref_root = ref->root;

	if (node->ref_mod != 1) {
		btrfs_err(trans->fs_info,
	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
			  node->bytenr, node->ref_mod, node->action, ref_root,
			  parent);
		return -EIO;
	}
	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		BUG_ON(!extent_op || !extent_op->update_flags);
		ret = alloc_reserved_tree_block(trans, node, extent_op);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
					     ref->level, 0, 1, extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, node, parent, ref_root,
					  ref->level, 0, 1, extent_op);
	} else {
		BUG();
	}
	return ret;
}

/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_delayed_ref_node *node,
			       struct btrfs_delayed_extent_op *extent_op,
			       int insert_reserved)
{
	int ret = 0;

	if (trans->aborted) {
		if (insert_reserved)
			btrfs_pin_extent(trans->fs_info, node->bytenr,
					 node->num_bytes, 1);
		return 0;
	}

	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		ret = run_delayed_tree_ref(trans, node, extent_op,
					   insert_reserved);
	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
		 node->type == BTRFS_SHARED_DATA_REF_KEY)
		ret = run_delayed_data_ref(trans, node, extent_op,
					   insert_reserved);
	else
		BUG();
	if (ret && insert_reserved)
		btrfs_pin_extent(trans->fs_info, node->bytenr,
				 node->num_bytes, 1);
	return ret;
}

static inline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_ref_node *ref;

	if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
		return NULL;

	/*
	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
	 * This is to prevent a ref count from going down to zero, which deletes
	 * the extent item from the extent tree, when there still are references
	 * to add, which would fail because they would not find the extent item.
	 */
	if (!list_empty(&head->ref_add_list))
		return list_first_entry(&head->ref_add_list,
				struct btrfs_delayed_ref_node, add_list);

	ref = rb_entry(rb_first_cached(&head->ref_tree),
		       struct btrfs_delayed_ref_node, ref_node);
	ASSERT(list_empty(&ref->add_list));
	return ref;
}

static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
				      struct btrfs_delayed_ref_head *head)
{
	spin_lock(&delayed_refs->lock);
	head->processing = 0;
	delayed_refs->num_heads_ready++;
	spin_unlock(&delayed_refs->lock);
	btrfs_delayed_ref_unlock(head);
}

static struct btrfs_delayed_extent_op *cleanup_extent_op(
				struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_extent_op *extent_op = head->extent_op;

	if (!extent_op)
		return NULL;

	if (head->must_insert_reserved) {
		head->extent_op = NULL;
		btrfs_free_delayed_extent_op(extent_op);
		return NULL;
	}
	return extent_op;
}

static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	extent_op = cleanup_extent_op(head);
	if (!extent_op)
		return 0;
	head->extent_op = NULL;
	spin_unlock(&head->lock);
	ret = run_delayed_extent_op(trans, head, extent_op);
	btrfs_free_delayed_extent_op(extent_op);
	return ret ? ret : 1;
}

void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
				  struct btrfs_delayed_ref_root *delayed_refs,
				  struct btrfs_delayed_ref_head *head)
{
	int nr_items = 1;	/* Dropping this ref head update. */

	if (head->total_ref_mod < 0) {
		struct btrfs_space_info *space_info;
		u64 flags;

		if (head->is_data)
			flags = BTRFS_BLOCK_GROUP_DATA;
		else if (head->is_system)
			flags = BTRFS_BLOCK_GROUP_SYSTEM;
		else
			flags = BTRFS_BLOCK_GROUP_METADATA;
		space_info = btrfs_find_space_info(fs_info, flags);
		ASSERT(space_info);
		percpu_counter_add_batch(&space_info->total_bytes_pinned,
				   -head->num_bytes,
				   BTRFS_TOTAL_BYTES_PINNED_BATCH);

		/*
		 * We had csum deletions accounted for in our delayed refs rsv,
		 * we need to drop the csum leaves for this update from our
		 * delayed_refs_rsv.
		 */
		if (head->is_data) {
			spin_lock(&delayed_refs->lock);
			delayed_refs->pending_csums -= head->num_bytes;
			spin_unlock(&delayed_refs->lock);
			nr_items += btrfs_csum_bytes_to_leaves(fs_info,
				head->num_bytes);
		}
	}

	btrfs_delayed_refs_rsv_release(fs_info, nr_items);
}

static int cleanup_ref_head(struct btrfs_trans_handle *trans,
			    struct btrfs_delayed_ref_head *head)
{

	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	ret = run_and_cleanup_extent_op(trans, head);
	if (ret < 0) {
		unselect_delayed_ref_head(delayed_refs, head);
		btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
		return ret;
	} else if (ret) {
		return ret;
	}

	/*
	 * Need to drop our head ref lock and re-acquire the delayed ref lock
	 * and then re-check to make sure nobody got added.
	 */
	spin_unlock(&head->lock);
	spin_lock(&delayed_refs->lock);
	spin_lock(&head->lock);
	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
		spin_unlock(&head->lock);
		spin_unlock(&delayed_refs->lock);
		return 1;
	}
	btrfs_delete_ref_head(delayed_refs, head);
	spin_unlock(&head->lock);
	spin_unlock(&delayed_refs->lock);

	if (head->must_insert_reserved) {
		btrfs_pin_extent(fs_info, head->bytenr,
				 head->num_bytes, 1);
		if (head->is_data) {
			ret = btrfs_del_csums(trans, fs_info, head->bytenr,
					      head->num_bytes);
		}
	}

	btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);

	trace_run_delayed_ref_head(fs_info, head, 0);
	btrfs_delayed_ref_unlock(head);
	btrfs_put_delayed_ref_head(head);
	return 0;
}

static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
					struct btrfs_trans_handle *trans)
{
	struct btrfs_delayed_ref_root *delayed_refs =
		&trans->transaction->delayed_refs;
	struct btrfs_delayed_ref_head *head = NULL;
	int ret;

	spin_lock(&delayed_refs->lock);
	head = btrfs_select_ref_head(delayed_refs);
	if (!head) {
		spin_unlock(&delayed_refs->lock);
		return head;
	}

	/*
	 * Grab the lock that says we are going to process all the refs for
	 * this head
	 */
	ret = btrfs_delayed_ref_lock(delayed_refs, head);
	spin_unlock(&delayed_refs->lock);

	/*
	 * We may have dropped the spin lock to get the head mutex lock, and
	 * that might have given someone else time to free the head.  If that's
	 * true, it has been removed from our list and we can move on.
	 */
	if (ret == -EAGAIN)
		head = ERR_PTR(-EAGAIN);

	return head;
}

static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
				    struct btrfs_delayed_ref_head *locked_ref,
				    unsigned long *run_refs)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_extent_op *extent_op;
	struct btrfs_delayed_ref_node *ref;
	int must_insert_reserved = 0;
	int ret;

	delayed_refs = &trans->transaction->delayed_refs;

	lockdep_assert_held(&locked_ref->mutex);
	lockdep_assert_held(&locked_ref->lock);

	while ((ref = select_delayed_ref(locked_ref))) {
		if (ref->seq &&
		    btrfs_check_delayed_seq(fs_info, ref->seq)) {
			spin_unlock(&locked_ref->lock);
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			return -EAGAIN;
		}

		(*run_refs)++;
		ref->in_tree = 0;
		rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
		RB_CLEAR_NODE(&ref->ref_node);
		if (!list_empty(&ref->add_list))
			list_del(&ref->add_list);
		/*
		 * When we play the delayed ref, also correct the ref_mod on
		 * head
		 */
		switch (ref->action) {
		case BTRFS_ADD_DELAYED_REF:
		case BTRFS_ADD_DELAYED_EXTENT:
			locked_ref->ref_mod -= ref->ref_mod;
			break;
		case BTRFS_DROP_DELAYED_REF:
			locked_ref->ref_mod += ref->ref_mod;
			break;
		default:
			WARN_ON(1);
		}
		atomic_dec(&delayed_refs->num_entries);

		/*
		 * Record the must_insert_reserved flag before we drop the
		 * spin lock.
		 */
		must_insert_reserved = locked_ref->must_insert_reserved;
		locked_ref->must_insert_reserved = 0;

		extent_op = locked_ref->extent_op;
		locked_ref->extent_op = NULL;
		spin_unlock(&locked_ref->lock);

		ret = run_one_delayed_ref(trans, ref, extent_op,
					  must_insert_reserved);

		btrfs_free_delayed_extent_op(extent_op);
		if (ret) {
			unselect_delayed_ref_head(delayed_refs, locked_ref);
			btrfs_put_delayed_ref(ref);
			btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
				    ret);
			return ret;
		}

		btrfs_put_delayed_ref(ref);
		cond_resched();

		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
	}

	return 0;
}

1977 1978 1979 1980
/*
 * Returns 0 on success or if called with an already aborted transaction.
 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
 */
1981 1982
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
					     unsigned long nr)
1983
{
1984
	struct btrfs_fs_info *fs_info = trans->fs_info;
1985 1986
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *locked_ref = NULL;
1987
	ktime_t start = ktime_get();
1988
	int ret;
1989
	unsigned long count = 0;
1990
	unsigned long actual_count = 0;
1991 1992

	delayed_refs = &trans->transaction->delayed_refs;
1993
	do {
1994
		if (!locked_ref) {
1995
			locked_ref = btrfs_obtain_ref_head(trans);
1996 1997 1998 1999 2000 2001
			if (IS_ERR_OR_NULL(locked_ref)) {
				if (PTR_ERR(locked_ref) == -EAGAIN) {
					continue;
				} else {
					break;
				}
2002
			}
2003
			count++;
2004
		}
2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016
		/*
		 * We need to try and merge add/drops of the same ref since we
		 * can run into issues with relocate dropping the implicit ref
		 * and then it being added back again before the drop can
		 * finish.  If we merged anything we need to re-loop so we can
		 * get a good ref.
		 * Or we can get node references of the same type that weren't
		 * merged when created due to bumps in the tree mod seq, and
		 * we need to merge them to prevent adding an inline extent
		 * backref before dropping it (triggering a BUG_ON at
		 * insert_inline_extent_backref()).
		 */
2017
		spin_lock(&locked_ref->lock);
2018
		btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2019

2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032
		ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
						      &actual_count);
		if (ret < 0 && ret != -EAGAIN) {
			/*
			 * Error, btrfs_run_delayed_refs_for_head already
			 * unlocked everything so just bail out
			 */
			return ret;
		} else if (!ret) {
			/*
			 * Success, perform the usual cleanup of a processed
			 * head
			 */
2033
			ret = cleanup_ref_head(trans, locked_ref);
			if (ret > 0) {
				/* We dropped our lock, we need to loop. */
				ret = 0;
2037
				continue;
2038 2039
			} else if (ret) {
				return ret;
2040
			}
2041
		}
2042

2043
		/*
2044 2045
		 * Either success case or btrfs_run_delayed_refs_for_head
		 * returned -EAGAIN, meaning we need to select another head
2046 2047
		 */

2048
		locked_ref = NULL;
2049
		cond_resched();
2050
	} while ((nr != -1 && count < nr) || locked_ref);
2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066

	/*
	 * We don't want to include ref heads since we can have empty ref heads
	 * and those will drastically skew our runtime down since we just do
	 * accounting, no actual extent tree updates.
	 */
	if (actual_count > 0) {
		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
		u64 avg;

		/*
		 * We weigh the current average higher than our current runtime
		 * to avoid large swings in the average.
		 */
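		/*
		 * Illustrative example (numbers made up, not from the original
		 * source): with a previous average of 800us and a measured
		 * runtime of 400us, avg = 800 * 3 + 400 = 2800 and the new
		 * average becomes 2800 >> 2 = 700us, so a single run only
		 * moves the average by a quarter of the difference.
		 */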
		spin_lock(&delayed_refs->lock);
		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
		spin_unlock(&delayed_refs->lock);
	}
	return 0;
2071 2072
}

2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115
#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning
 */
static u64 find_middle(struct rb_root *root)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int alt = 1;
	u64 middle;
	u64 first = 0, last = 0;

	n = rb_first(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		first = entry->bytenr;
	}
	n = rb_last(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		last = entry->bytenr;
	}
	n = root->rb_node;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		middle = entry->bytenr;

		if (alt)
			n = n->rb_left;
		else
			n = n->rb_right;

		alt = 1 - alt;
	}
	return middle;
}
#endif

static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
{
	u64 num_bytes;

	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
			     sizeof(struct btrfs_extent_inline_ref));
	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
		num_bytes += heads * sizeof(struct btrfs_tree_block_info);

	/*
	 * We don't ever fill up leaves all the way so multiply by 2 just to be
	 * closer to what we're really going to want to use.
	 */
	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
}
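
/*
 * Rough worked example for heads_to_leaves() (illustrative, not exact): each
 * head is costed as an extent item plus one inline ref (plus a tree block
 * info item on non-skinny-metadata filesystems), i.e. a few dozen bytes, so
 * with a 16K nodesize on the order of a few hundred heads fit in one leaf.
 * Callers should therefore treat the result only as an estimate of how many
 * leaves flushing that many heads may touch.
 */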

/*
 * Takes the number of bytes to be csumm'ed and figures out how many leaves it
 * would require to store the csums for that many bytes.
 */
u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
{
	u64 csum_size;
	u64 num_csums_per_leaf;
	u64 num_csums;

	csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
	num_csums_per_leaf = div64_u64(csum_size,
			(u64)btrfs_super_csum_size(fs_info->super_copy));
	num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
	num_csums += num_csums_per_leaf - 1;
	num_csums = div64_u64(num_csums, num_csums_per_leaf);
	return num_csums;
}
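
/*
 * Example with illustrative parameters (4K sectorsize, 4-byte crc32c csums,
 * roughly 16K of usable item space per leaf): one leaf holds about
 * 16K / 4 = ~4000 csums, i.e. covers ~16MB of data, so checksumming 1GB of
 * new data needs on the order of 64 csum leaves.
 */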

/*
 * this starts processing the delayed reference count updates and
 * extent insertions we have queued up so far.  count can be
 * 0, which means to process everything in the tree at the start
 * of the run (but not newly added entries), or it can be some target
 * number you'd like to process.
2157 2158 2159
 *
 * Returns 0 on success or if called with an aborted transaction
 * Returns <0 on error and aborts the transaction
2160 2161
 */
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2162
			   unsigned long count)
2163
{
2164
	struct btrfs_fs_info *fs_info = trans->fs_info;
2165 2166
	struct rb_node *node;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *head;
2168 2169 2170
	int ret;
	int run_all = count == (unsigned long)-1;

2171 2172 2173 2174
	/* We'll clean this up in btrfs_cleanup_transaction */
	if (trans->aborted)
		return 0;

2175
	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2176 2177
		return 0;

2178
	delayed_refs = &trans->transaction->delayed_refs;
	if (count == 0)
2180
		count = atomic_read(&delayed_refs->num_entries) * 2;
2181

2182
again:
2183 2184 2185
#ifdef SCRAMBLE_DELAYED_REFS
	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
2186
	ret = __btrfs_run_delayed_refs(trans, count);
2187
	if (ret < 0) {
2188
		btrfs_abort_transaction(trans, ret);
2189
		return ret;
2190
	}
2191

2192
	if (run_all) {
2193
		btrfs_create_pending_block_groups(trans);
2194

2195
		spin_lock(&delayed_refs->lock);
2196
		node = rb_first_cached(&delayed_refs->href_root);
2197 2198
		if (!node) {
			spin_unlock(&delayed_refs->lock);
2199
			goto out;
2200
		}
2201 2202 2203 2204
		head = rb_entry(node, struct btrfs_delayed_ref_head,
				href_node);
		refcount_inc(&head->refs);
		spin_unlock(&delayed_refs->lock);
2205

2206 2207 2208
		/* Mutex was contended, block until it's released and retry. */
		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);
2209

2210
		btrfs_put_delayed_ref_head(head);
2211
		cond_resched();
2212
		goto again;
2213
	}
2214
out:
2215 2216 2217
	return 0;
}

2218 2219
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
				u64 bytenr, u64 num_bytes, u64 flags,
2220
				int level, int is_data)
2221 2222 2223 2224
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

2225
	extent_op = btrfs_alloc_delayed_extent_op();
2226 2227 2228 2229
	if (!extent_op)
		return -ENOMEM;

	extent_op->flags_to_set = flags;
2230 2231 2232
	extent_op->update_flags = true;
	extent_op->update_key = false;
	extent_op->is_data = is_data ? true : false;
2233
	extent_op->level = level;
2234

2235
	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2236
	if (ret)
2237
		btrfs_free_delayed_extent_op(extent_op);
2238 2239 2240
	return ret;
}

2241
static noinline int check_delayed_ref(struct btrfs_root *root,
2242 2243 2244 2245 2246 2247 2248
				      struct btrfs_path *path,
				      u64 objectid, u64 offset, u64 bytenr)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_data_ref *data_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
2249
	struct btrfs_transaction *cur_trans;
2250
	struct rb_node *node;
2251 2252
	int ret = 0;

2253
	spin_lock(&root->fs_info->trans_lock);
2254
	cur_trans = root->fs_info->running_transaction;
2255 2256 2257
	if (cur_trans)
		refcount_inc(&cur_trans->use_count);
	spin_unlock(&root->fs_info->trans_lock);
2258 2259 2260 2261
	if (!cur_trans)
		return 0;

	delayed_refs = &cur_trans->delayed_refs;
2262
	spin_lock(&delayed_refs->lock);
2263
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2264 2265
	if (!head) {
		spin_unlock(&delayed_refs->lock);
2266
		btrfs_put_transaction(cur_trans);
2267 2268
		return 0;
	}
2269 2270

	if (!mutex_trylock(&head->mutex)) {
2271
		refcount_inc(&head->refs);
2272 2273
		spin_unlock(&delayed_refs->lock);

2274
		btrfs_release_path(path);
2275

2276 2277 2278 2279
		/*
		 * Mutex was contended, block until it's released and let
		 * caller try again
		 */
2280 2281
		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);
2282
		btrfs_put_delayed_ref_head(head);
2283
		btrfs_put_transaction(cur_trans);
2284 2285
		return -EAGAIN;
	}
2286
	spin_unlock(&delayed_refs->lock);
2287

2288
	spin_lock(&head->lock);
2289 2290 2291 2292
	/*
	 * XXX: We should replace this with a proper search function in the
	 * future.
	 */
2293 2294
	for (node = rb_first_cached(&head->ref_tree); node;
	     node = rb_next(node)) {
2295
		ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2296 2297 2298 2299 2300
		/* If it's a shared ref we know a cross reference exists */
		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
			ret = 1;
			break;
		}
2301

2302
		data_ref = btrfs_delayed_node_to_data_ref(ref);
2303

2304 2305 2306 2307 2308 2309 2310 2311 2312 2313
		/*
		 * If our ref doesn't match the one we're currently looking at
		 * then we have a cross reference.
		 */
		if (data_ref->root != root->root_key.objectid ||
		    data_ref->objectid != objectid ||
		    data_ref->offset != offset) {
			ret = 1;
			break;
		}
2314
	}
2315
	spin_unlock(&head->lock);
2316
	mutex_unlock(&head->mutex);
2317
	btrfs_put_transaction(cur_trans);
2318 2319 2320
	return ret;
}

2321
static noinline int check_committed_ref(struct btrfs_root *root,
2322 2323
					struct btrfs_path *path,
					u64 objectid, u64 offset, u64 bytenr)
2324
{
2325 2326
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *extent_root = fs_info->extent_root;
2327
	struct extent_buffer *leaf;
2328 2329 2330
	struct btrfs_extent_data_ref *ref;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_extent_item *ei;
2331
	struct btrfs_key key;
2332
	u32 item_size;
2333
	int type;
2334
	int ret;
2335

2336
	key.objectid = bytenr;
	key.offset = (u64)-1;
2338
	key.type = BTRFS_EXTENT_ITEM_KEY;
2339 2340 2341 2342

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
2343
	BUG_ON(ret == 0); /* Corruption */

	ret = -ENOENT;
	if (path->slots[0] == 0)
		goto out;
2348

	path->slots[0]--;
2350
	leaf = path->nodes[0];
2351
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2352

2353
	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2354
		goto out;
2355

2356 2357 2358
	ret = 1;
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2359

2360 2361 2362
	if (item_size != sizeof(*ei) +
	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
		goto out;
2363

2364 2365 2366 2367 2368
	if (btrfs_extent_generation(leaf, ei) <=
	    btrfs_root_last_snapshot(&root->root_item))
		goto out;

	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2369 2370 2371

	type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
	if (type != BTRFS_EXTENT_DATA_REF_KEY)
2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387
		goto out;

	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
	if (btrfs_extent_refs(leaf, ei) !=
	    btrfs_extent_data_ref_count(leaf, ref) ||
	    btrfs_extent_data_ref_root(leaf, ref) !=
	    root->root_key.objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		goto out;

	ret = 0;
out:
	return ret;
}

2388 2389
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
			  u64 bytenr)
2390 2391 2392 2393 2394 2395
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
2396
		return -ENOMEM;
2397 2398

	do {
2399
		ret = check_committed_ref(root, path, objectid,
2400 2401
					  offset, bytenr);
		if (ret && ret != -ENOENT)
2402
			goto out;

2404 2405
		ret = check_delayed_ref(root, path, objectid, offset, bytenr);
	} while (ret == -EAGAIN);
2406

2407
out:
	btrfs_free_path(path);
2409 2410
	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
		WARN_ON(ret > 0);
2411
	return ret;
2412
}

2414
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2415
			   struct btrfs_root *root,
2416
			   struct extent_buffer *buf,
2417
			   int full_backref, int inc)
{
2419
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
2421 2422
	u64 num_bytes;
	u64 parent;
	u64 ref_root;
	u32 nritems;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
2427 2428
	struct btrfs_ref generic_ref = { 0 };
	bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
	int i;
2430
	int action;
	int level;
	int ret = 0;
2433

2434
	if (btrfs_is_testing(fs_info))
2435
		return 0;
2436

	ref_root = btrfs_header_owner(buf);
	nritems = btrfs_header_nritems(buf);
	level = btrfs_header_level(buf);

2441
	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
2442
		return 0;

2444 2445 2446 2447
	if (full_backref)
		parent = buf->start;
	else
		parent = 0;
2448 2449 2450 2451
	if (inc)
		action = BTRFS_ADD_DELAYED_REF;
	else
		action = BTRFS_DROP_DELAYED_REF;
2452 2453

	for (i = 0; i < nritems; i++) {
		if (level == 0) {
2455
			btrfs_item_key_to_cpu(buf, &key, i);
2456
			if (key.type != BTRFS_EXTENT_DATA_KEY)
				continue;
2458
			fi = btrfs_item_ptr(buf, i,
					    struct btrfs_file_extent_item);
			if (btrfs_file_extent_type(buf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE)
				continue;
			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
			if (bytenr == 0)
				continue;
2466 2467 2468

			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
			key.offset -= btrfs_file_extent_offset(buf, fi);
2469 2470 2471 2472 2473 2474
			btrfs_init_generic_ref(&generic_ref, action, bytenr,
					       num_bytes, parent);
			generic_ref.real_root = root->root_key.objectid;
			btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
					    key.offset);
			generic_ref.skip_qgroup = for_reloc;
2475
			if (inc)
2476
				ret = btrfs_inc_extent_ref(trans, &generic_ref);
2477
			else
2478
				ret = btrfs_free_extent(trans, &generic_ref);
			if (ret)
				goto fail;
		} else {
2482
			bytenr = btrfs_node_blockptr(buf, i);
2483
			num_bytes = fs_info->nodesize;
2484 2485 2486 2487 2488
			btrfs_init_generic_ref(&generic_ref, action, bytenr,
					       num_bytes, parent);
			generic_ref.real_root = root->root_key.objectid;
			btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
			generic_ref.skip_qgroup = for_reloc;
2489
			if (inc)
2490
				ret = btrfs_inc_extent_ref(trans, &generic_ref);
2491
			else
2492
				ret = btrfs_free_extent(trans, &generic_ref);
			if (ret)
				goto fail;
		}
	}
	return 0;
fail:
2499 2500 2501 2502
	return ret;
}

int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2503
		  struct extent_buffer *buf, int full_backref)
2504
{
2505
	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2506 2507 2508
}

int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2509
		  struct extent_buffer *buf, int full_backref)
2510
{
2511
	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}

2514
int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
2515 2516 2517 2518
{
	struct btrfs_block_group_cache *block_group;
	int readonly = 0;

2519
	block_group = btrfs_lookup_block_group(fs_info, bytenr);
2520 2521 2522
	if (!block_group || block_group->ro)
		readonly = 1;
	if (block_group)
2523
		btrfs_put_block_group(block_group);
2524 2525 2526
	return readonly;
}

2527 2528 2529
/*
 * returns target flags in extended format or 0 if restripe for this
 * chunk_type is not in progress
2530
 *
2531
 * should be called with balance_lock held
2532
 */
2533
u64 btrfs_get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	u64 target = 0;

	if (!bctl)
		return 0;

	if (flags & BTRFS_BLOCK_GROUP_DATA &&
	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
	}

	return target;
}

/*
 * @flags: available profiles in extended format (see ctree.h)
 *
 * Returns reduced profile in chunk format.  If profile changing is in
 * progress (either running or paused) picks the target profile (if it's
 * already available), otherwise falls back to plain reducing.
 */
static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 num_devices = fs_info->fs_devices->rw_devices;
	u64 target;
	u64 raid_type;
	u64 allowed = 0;
	/*
	 * see if restripe for this chunk_type is in progress, if so
	 * try to reduce to the target profile
	 */
	spin_lock(&fs_info->balance_lock);
	target = btrfs_get_restripe_target(fs_info, flags);
	if (target) {
		/* pick target profile only if it's already available */
		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
			spin_unlock(&fs_info->balance_lock);
			return extended_to_chunk(target);
		}
	}
	spin_unlock(&fs_info->balance_lock);

	/* First, mask out the RAID levels which aren't possible */
	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
			allowed |= btrfs_raid_array[raid_type].bg_flag;
	}
	allowed &= flags;

	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
		allowed = BTRFS_BLOCK_GROUP_RAID6;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
		allowed = BTRFS_BLOCK_GROUP_RAID5;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
		allowed = BTRFS_BLOCK_GROUP_RAID10;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
		allowed = BTRFS_BLOCK_GROUP_RAID1;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
		allowed = BTRFS_BLOCK_GROUP_RAID0;

	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;

	return extended_to_chunk(flags | allowed);
}
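
/*
 * Example of the reduction above (illustrative): if the available metadata
 * profiles include RAID1 | RAID0 and at least two rw devices are present,
 * both survive the devs_min mask, and the preference chain picks RAID1, so
 * new metadata chunks are allocated as RAID1 even though RAID0 would fit.
 */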

static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
2609
	unsigned seq;
2610
	u64 flags;
2611 2612

	do {
2613
		flags = orig_flags;
2614
		seq = read_seqbegin(&fs_info->profiles_lock);
2615 2616

		if (flags & BTRFS_BLOCK_GROUP_DATA)
2617
			flags |= fs_info->avail_data_alloc_bits;
2618
		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2619
			flags |= fs_info->avail_system_alloc_bits;
2620
		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
2621 2622
			flags |= fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));
2623

2624
	return btrfs_reduce_alloc_profile(fs_info, flags);
}

2627 2628 2629 2630 2631
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
	return get_alloc_profile(fs_info, orig_flags);
}

2632
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
{
2634
	struct btrfs_fs_info *fs_info = root->fs_info;
2635
	u64 flags;
	u64 ret;

2638 2639
	if (data)
		flags = BTRFS_BLOCK_GROUP_DATA;
2640
	else if (root == fs_info->chunk_root)
2641
		flags = BTRFS_BLOCK_GROUP_SYSTEM;
	else
2643
		flags = BTRFS_BLOCK_GROUP_METADATA;

2645
	ret = get_alloc_profile(fs_info, flags);
	return ret;
}

2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663
u64 btrfs_data_alloc_profile(struct btrfs_fs_info *fs_info)
{
	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_DATA);
}

u64 btrfs_metadata_alloc_profile(struct btrfs_fs_info *fs_info)
{
	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_METADATA);
}

u64 btrfs_system_alloc_profile(struct btrfs_fs_info *fs_info)
{
	return get_alloc_profile(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
}

2664
static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
2665
{
	struct btrfs_block_group_cache *cache;
2667
	u64 bytenr;

2669 2670 2671
	spin_lock(&fs_info->block_group_cache_lock);
	bytenr = fs_info->first_logical_byte;
	spin_unlock(&fs_info->block_group_cache_lock);
2672 2673 2674 2675

	if (bytenr < (u64)-1)
		return bytenr;

2676
	cache = btrfs_lookup_first_block_group(fs_info, search_start);
	if (!cache)
2678
		return 0;

2680
	bytenr = cache->key.objectid;
2681
	btrfs_put_block_group(cache);
2682 2683

	return bytenr;
2684 2685
}

2686
static int pin_down_extent(struct btrfs_block_group_cache *cache,
2687
			   u64 bytenr, u64 num_bytes, int reserved)
2688
{
2689 2690
	struct btrfs_fs_info *fs_info = cache->fs_info;

2691 2692 2693
	spin_lock(&cache->space_info->lock);
	spin_lock(&cache->lock);
	cache->pinned += num_bytes;
2694 2695
	btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
					     num_bytes);
2696 2697 2698 2699 2700 2701
	if (reserved) {
		cache->reserved -= num_bytes;
		cache->space_info->bytes_reserved -= num_bytes;
	}
	spin_unlock(&cache->lock);
	spin_unlock(&cache->space_info->lock);

2703
	trace_btrfs_space_reservation(fs_info, "pinned",
				      cache->space_info->flags, num_bytes, 1);
2705 2706
	percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
		    num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
2707
	set_extent_dirty(fs_info->pinned_extents, bytenr,
2708 2709 2710
			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
	return 0;
}

2712 2713 2714
/*
 * this function must be called within transaction
 */
2715
int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
2716 2717 2718
		     u64 bytenr, u64 num_bytes, int reserved)
{
	struct btrfs_block_group_cache *cache;

2720
	cache = btrfs_lookup_block_group(fs_info, bytenr);
2721
	BUG_ON(!cache); /* Logic error */
2722

2723
	pin_down_extent(cache, bytenr, num_bytes, reserved);
2724 2725

	btrfs_put_block_group(cache);
2726 2727 2728
	return 0;
}

2729
/*
2730 2731
 * this function must be called within transaction
 */
2732
int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
2733 2734 2735
				    u64 bytenr, u64 num_bytes)
{
	struct btrfs_block_group_cache *cache;
2736
	int ret;
2737

2738
	cache = btrfs_lookup_block_group(fs_info, bytenr);
2739 2740
	if (!cache)
		return -EINVAL;
2741 2742 2743 2744 2745 2746 2747

	/*
	 * pull in the free space cache (if any) so that our pin
	 * removes the free space from the cache.  We have load_only set
	 * to one because the slow code to read in the free extents does check
	 * the pinned extents.
	 */
2748
	btrfs_cache_block_group(cache, 1);
2749

2750
	pin_down_extent(cache, bytenr, num_bytes, 0);
2751 2752

	/* remove us from the free space cache (if we're there at all) */
2753
	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
2754
	btrfs_put_block_group(cache);
2755
	return ret;
2756 2757
}

2758 2759
static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
				   u64 start, u64 num_bytes)
2760 2761 2762 2763 2764
{
	int ret;
	struct btrfs_block_group_cache *block_group;
	struct btrfs_caching_control *caching_ctl;

2765
	block_group = btrfs_lookup_block_group(fs_info, start);
2766 2767 2768
	if (!block_group)
		return -EINVAL;

2769
	btrfs_cache_block_group(block_group, 0);
2770
	caching_ctl = btrfs_get_caching_control(block_group);
2771 2772 2773

	if (!caching_ctl) {
		/* Logic error */
2774
		BUG_ON(!btrfs_block_group_cache_done(block_group));
2775 2776 2777 2778 2779
		ret = btrfs_remove_free_space(block_group, start, num_bytes);
	} else {
		mutex_lock(&caching_ctl->mutex);

		if (start >= caching_ctl->progress) {
2780 2781
			ret = btrfs_add_excluded_extent(fs_info, start,
							num_bytes);
2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794
		} else if (start + num_bytes <= caching_ctl->progress) {
			ret = btrfs_remove_free_space(block_group,
						      start, num_bytes);
		} else {
			num_bytes = caching_ctl->progress - start;
			ret = btrfs_remove_free_space(block_group,
						      start, num_bytes);
			if (ret)
				goto out_lock;

			num_bytes = (start + num_bytes) -
				caching_ctl->progress;
			start = caching_ctl->progress;
2795 2796
			ret = btrfs_add_excluded_extent(fs_info, start,
							num_bytes);
2797 2798 2799
		}
out_lock:
		mutex_unlock(&caching_ctl->mutex);
2800
		btrfs_put_caching_control(caching_ctl);
2801 2802 2803 2804 2805
	}
	btrfs_put_block_group(block_group);
	return ret;
}

2806
int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2807
{
2808
	struct btrfs_fs_info *fs_info = eb->fs_info;
2809 2810 2811 2812
	struct btrfs_file_extent_item *item;
	struct btrfs_key key;
	int found_type;
	int i;
2813
	int ret = 0;
2814

2815
	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829
		return 0;

	for (i = 0; i < btrfs_header_nritems(eb); i++) {
		btrfs_item_key_to_cpu(eb, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
		found_type = btrfs_file_extent_type(eb, item);
		if (found_type == BTRFS_FILE_EXTENT_INLINE)
			continue;
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			continue;
		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2830 2831 2832
		ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
		if (ret)
			break;
2833 2834
	}

2835
	return ret;
2836 2837
}

2838 2839 2840 2841 2842 2843
static void
btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
{
	atomic_inc(&bg->reservations);
}

2844
void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
2845
{
2846 2847 2848
	struct btrfs_caching_control *next;
	struct btrfs_caching_control *caching_ctl;
	struct btrfs_block_group_cache *cache;
2849

2850
	down_write(&fs_info->commit_root_sem);
2851

2852 2853 2854
	list_for_each_entry_safe(caching_ctl, next,
				 &fs_info->caching_block_groups, list) {
		cache = caching_ctl->block_group;
2855
		if (btrfs_block_group_cache_done(cache)) {
2856 2857
			cache->last_byte_to_unpin = (u64)-1;
			list_del_init(&caching_ctl->list);
2858
			btrfs_put_caching_control(caching_ctl);
2859
		} else {
2860
			cache->last_byte_to_unpin = caching_ctl->progress;
2861 2862
		}
	}
2863 2864 2865 2866 2867 2868

	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
		fs_info->pinned_extents = &fs_info->freed_extents[1];
	else
		fs_info->pinned_extents = &fs_info->freed_extents[0];

2869
	up_write(&fs_info->commit_root_sem);
2870

2871
	btrfs_update_global_block_rsv(fs_info);
2872 2873
}

/*
 * Returns the free cluster for the given space info and sets empty_cluster to
 * what it should be based on the mount options.
 */
static struct btrfs_free_cluster *
fetch_cluster_info(struct btrfs_fs_info *fs_info,
		   struct btrfs_space_info *space_info, u64 *empty_cluster)
{
	struct btrfs_free_cluster *ret = NULL;

	*empty_cluster = 0;
	if (btrfs_mixed_space_info(space_info))
		return ret;

	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		ret = &fs_info->meta_alloc_cluster;
		if (btrfs_test_opt(fs_info, SSD))
			*empty_cluster = SZ_2M;
		else
			*empty_cluster = SZ_64K;
	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
		   btrfs_test_opt(fs_info, SSD_SPREAD)) {
		*empty_cluster = SZ_2M;
		ret = &fs_info->data_alloc_cluster;
	}

	return ret;
}
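
/*
 * For example (mirrors the logic above): mounting with -o ssd gives metadata
 * allocations a 2M empty_cluster target instead of the default 64K, while
 * data allocations only get a cluster (also 2M) when -o ssd_spread is used.
 */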

static int unpin_extent_range(struct btrfs_fs_info *fs_info,
			      u64 start, u64 end,
2905
			      const bool return_free_space)
{
2907
	struct btrfs_block_group_cache *cache = NULL;
2908 2909
	struct btrfs_space_info *space_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2910
	struct btrfs_free_cluster *cluster = NULL;
2911
	u64 len;
2912 2913
	u64 total_unpinned = 0;
	u64 empty_cluster = 0;
2914
	bool readonly;

2916
	while (start <= end) {
2917
		readonly = false;
2918 2919 2920 2921
		if (!cache ||
		    start >= cache->key.objectid + cache->key.offset) {
			if (cache)
				btrfs_put_block_group(cache);
2922
			total_unpinned = 0;
2923
			cache = btrfs_lookup_block_group(fs_info, start);
2924
			BUG_ON(!cache); /* Logic error */
2925

2926
			cluster = fetch_cluster_info(fs_info,
2927 2928 2929
						     cache->space_info,
						     &empty_cluster);
			empty_cluster <<= 1;
2930 2931 2932 2933 2934 2935 2936
		}

		len = cache->key.objectid + cache->key.offset - start;
		len = min(len, end + 1 - start);

		if (start < cache->last_byte_to_unpin) {
			len = min(len, cache->last_byte_to_unpin - start);
2937 2938
			if (return_free_space)
				btrfs_add_free_space(cache, start, len);
2939 2940
		}

2941
		start += len;
2942
		total_unpinned += len;
2943
		space_info = cache->space_info;
2944

2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957
		/*
		 * If this space cluster has been marked as fragmented and we've
		 * unpinned enough in this block group to potentially allow a
		 * cluster to be created inside of it go ahead and clear the
		 * fragmented check.
		 */
		if (cluster && cluster->fragmented &&
		    total_unpinned > empty_cluster) {
			spin_lock(&cluster->lock);
			cluster->fragmented = 0;
			spin_unlock(&cluster->lock);
		}

2958
		spin_lock(&space_info->lock);
2959 2960
		spin_lock(&cache->lock);
		cache->pinned -= len;
2961
		btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);

		trace_btrfs_space_reservation(fs_info, "pinned",
					      space_info->flags, len, 0);
2965
		space_info->max_extent_size = 0;
2966 2967
		percpu_counter_add_batch(&space_info->total_bytes_pinned,
			    -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
2968 2969 2970 2971
		if (cache->ro) {
			space_info->bytes_readonly += len;
			readonly = true;
		}
2972
		spin_unlock(&cache->lock);
2973 2974 2975
		if (!readonly && return_free_space &&
		    global_rsv->space_info == space_info) {
			u64 to_add = len;
2976

2977 2978
			spin_lock(&global_rsv->lock);
			if (!global_rsv->full) {
2979 2980 2981
				to_add = min(len, global_rsv->size -
					     global_rsv->reserved);
				global_rsv->reserved += to_add;
2982 2983
				btrfs_space_info_update_bytes_may_use(fs_info,
						space_info, to_add);
2984 2985
				if (global_rsv->reserved >= global_rsv->size)
					global_rsv->full = 1;
2986 2987 2988 2989 2990
				trace_btrfs_space_reservation(fs_info,
							      "space_info",
							      space_info->flags,
							      to_add, 1);
				len -= to_add;
2991 2992
			}
			spin_unlock(&global_rsv->lock);
2993 2994
			/* Add to any tickets we may have */
			if (len)
2995 2996
				btrfs_space_info_add_new_bytes(fs_info,
						space_info, len);
2997 2998
		}
		spin_unlock(&space_info->lock);
	}
3000 3001 3002

	if (cache)
		btrfs_put_block_group(cache);
	return 0;
}

3006
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
3007
{
3008
	struct btrfs_fs_info *fs_info = trans->fs_info;
3009 3010
	struct btrfs_block_group_cache *block_group, *tmp;
	struct list_head *deleted_bgs;
3011
	struct extent_io_tree *unpin;
3012 3013
	u64 start;
	u64 end;
3014 3015
	int ret;

3016 3017 3018 3019 3020
	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
		unpin = &fs_info->freed_extents[1];
	else
		unpin = &fs_info->freed_extents[0];

3021
	while (!trans->aborted) {
3022 3023
		struct extent_state *cached_state = NULL;

3024
		mutex_lock(&fs_info->unused_bg_unpin_mutex);
3025
		ret = find_first_extent_bit(unpin, 0, &start, &end,
3026
					    EXTENT_DIRTY, &cached_state);
3027 3028
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
3029
			break;
3030
		}
3031

3032
		if (btrfs_test_opt(fs_info, DISCARD))
3033
			ret = btrfs_discard_extent(fs_info, start,
3034
						   end + 1 - start, NULL);
3035

3036
		clear_extent_dirty(unpin, start, end, &cached_state);
3037
		unpin_extent_range(fs_info, start, end, true);
3038
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
3039
		free_extent_state(cached_state);
3040
		cond_resched();
3041
	}

3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053
	/*
	 * Transaction is finished.  We don't need the lock anymore.  We
	 * do need to clean up the block groups in case of a transaction
	 * abort.
	 */
	deleted_bgs = &trans->transaction->deleted_bgs;
	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
		u64 trimmed = 0;

		ret = -EROFS;
		if (!trans->aborted)
3054
			ret = btrfs_discard_extent(fs_info,
3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065
						   block_group->key.objectid,
						   block_group->key.offset,
						   &trimmed);

		list_del_init(&block_group->bg_list);
		btrfs_put_block_group_trimming(block_group);
		btrfs_put_block_group(block_group);

		if (ret) {
			const char *errstr = btrfs_decode_error(ret);
			btrfs_warn(fs_info,
3066
			   "discard failed while removing blockgroup: errno=%d %s",
3067 3068 3069 3070
				   ret, errstr);
		}
	}

	return 0;
}

3074
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3075 3076 3077 3078
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
			       struct btrfs_delayed_extent_op *extent_op)
3079
{
3080
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_key key;
3082
	struct btrfs_path *path;
3083
	struct btrfs_root *extent_root = info->extent_root;
3084
	struct extent_buffer *leaf;
3085 3086
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
3087
	int ret;
3088
	int is_data;
3089 3090 3091
	int extent_slot = 0;
	int found_extent = 0;
	int num_to_del = 1;
3092 3093
	u32 item_size;
	u64 refs;
3094 3095
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	int last_ref = 0;
3097
	bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);

3099
	path = btrfs_alloc_path();
3100 3101
	if (!path)
		return -ENOMEM;
3102

3103
	path->reada = READA_FORWARD;
3104
	path->leave_spinning = 1;
3105 3106 3107 3108

	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
	BUG_ON(!is_data && refs_to_drop != 1);

3109
	if (is_data)
3110
		skinny_metadata = false;
3111

3112 3113
	ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
				    parent, root_objectid, owner_objectid,
3114
				    owner_offset);
3115
	if (ret == 0) {
3116
		extent_slot = path->slots[0];
3117 3118
		while (extent_slot >= 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
3119
					      extent_slot);
3120
			if (key.objectid != bytenr)
3121
				break;
3122 3123
			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes) {
3124 3125 3126
				found_extent = 1;
				break;
			}
3127 3128 3129 3130 3131
			if (key.type == BTRFS_METADATA_ITEM_KEY &&
			    key.offset == owner_objectid) {
				found_extent = 1;
				break;
			}
3132 3133
			if (path->slots[0] - extent_slot > 5)
				break;
3134
			extent_slot--;
3135
		}
3136

		if (!found_extent) {
3138
			BUG_ON(iref);
3139
			ret = remove_extent_backref(trans, path, NULL,
3140
						    refs_to_drop,
						    is_data, &last_ref);
3142
			if (ret) {
3143
				btrfs_abort_transaction(trans, ret);
3144 3145
				goto out;
			}
3146
			btrfs_release_path(path);
3147
			path->leave_spinning = 1;
3148 3149 3150 3151 3152

			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;

3153 3154 3155 3156 3157
			if (!is_data && skinny_metadata) {
				key.type = BTRFS_METADATA_ITEM_KEY;
				key.offset = owner_objectid;
			}

			ret = btrfs_search_slot(trans, extent_root,
						&key, path, -1, 1);
3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175
			if (ret > 0 && skinny_metadata && path->slots[0]) {
				/*
				 * Couldn't find our skinny metadata item,
				 * see if we have ye olde extent item.
				 */
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == num_bytes)
					ret = 0;
			}

			if (ret > 0 && skinny_metadata) {
				skinny_metadata = false;
3176
				key.objectid = bytenr;
3177 3178 3179 3180 3181 3182 3183
				key.type = BTRFS_EXTENT_ITEM_KEY;
				key.offset = num_bytes;
				btrfs_release_path(path);
				ret = btrfs_search_slot(trans, extent_root,
							&key, path, -1, 1);
			}

3184
			if (ret) {
				btrfs_err(info,
					  "umm, got %d back from search, was looking for %llu",
					  ret, bytenr);
3188
				if (ret > 0)
3189
					btrfs_print_leaf(path->nodes[0]);
3190
			}
3191
			if (ret < 0) {
3192
				btrfs_abort_transaction(trans, ret);
3193 3194
				goto out;
			}
			extent_slot = path->slots[0];
		}
3197
	} else if (WARN_ON(ret == -ENOENT)) {
3198
		btrfs_print_leaf(path->nodes[0]);
3199 3200
		btrfs_err(info,
			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
3201 3202
			bytenr, parent, root_objectid, owner_objectid,
			owner_offset);
3203
		btrfs_abort_transaction(trans, ret);
3204
		goto out;
3205
	} else {
3206
		btrfs_abort_transaction(trans, ret);
3207
		goto out;
3208
	}
3209 3210

	leaf = path->nodes[0];
3211
	item_size = btrfs_item_size_nr(leaf, extent_slot);
3212
	if (unlikely(item_size < sizeof(*ei))) {
3213 3214 3215 3216 3217
		ret = -EINVAL;
		btrfs_print_v0_err(info);
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
3218
	ei = btrfs_item_ptr(leaf, extent_slot,
			    struct btrfs_extent_item);
3220 3221
	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
	    key.type == BTRFS_EXTENT_ITEM_KEY) {
3222 3223 3224 3225 3226
		struct btrfs_tree_block_info *bi;
		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
	}
3227

3228
	refs = btrfs_extent_refs(leaf, ei);
3229
	if (refs < refs_to_drop) {
		btrfs_err(info,
			  "trying to drop %d refs but we only have %Lu for bytenr %Lu",
			  refs_to_drop, refs, bytenr);
3233
		ret = -EINVAL;
3234
		btrfs_abort_transaction(trans, ret);
3235 3236
		goto out;
	}
3237
	refs -= refs_to_drop;
3238

3239 3240 3241 3242 3243 3244
	if (refs > 0) {
		if (extent_op)
			__run_delayed_extent_op(extent_op, leaf, ei);
		/*
		 * In the case of inline back ref, reference count will
		 * be updated by remove_extent_backref
3245
		 */
3246 3247 3248 3249 3250 3251 3252
		if (iref) {
			BUG_ON(!found_extent);
		} else {
			btrfs_set_extent_refs(leaf, ei, refs);
			btrfs_mark_buffer_dirty(leaf);
		}
		if (found_extent) {
3253 3254 3255
			ret = remove_extent_backref(trans, path, iref,
						    refs_to_drop, is_data,
						    &last_ref);
3256
			if (ret) {
3257
				btrfs_abort_transaction(trans, ret);
3258 3259
				goto out;
			}
3260
		}
3261 3262 3263
	} else {
		if (found_extent) {
			BUG_ON(is_data && refs_to_drop !=
3264
			       extent_data_ref_count(path, iref));
3265 3266 3267 3268 3269 3270 3271
			if (iref) {
				BUG_ON(path->slots[0] != extent_slot);
			} else {
				BUG_ON(path->slots[0] != extent_slot + 1);
				path->slots[0] = extent_slot;
				num_to_del = 2;
			}
		}
3273

		last_ref = 1;
3275 3276
		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
				      num_to_del);
3277
		if (ret) {
3278
			btrfs_abort_transaction(trans, ret);
3279 3280
			goto out;
		}
3281
		btrfs_release_path(path);
3282

3283
		if (is_data) {
3284
			ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
3285
			if (ret) {
3286
				btrfs_abort_transaction(trans, ret);
3287 3288
				goto out;
			}
3289 3290
		}

3291
		ret = add_to_free_space_tree(trans, bytenr, num_bytes);
3292
		if (ret) {
3293
			btrfs_abort_transaction(trans, ret);
3294 3295 3296
			goto out;
		}

3297
		ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
3298
		if (ret) {
3299
			btrfs_abort_transaction(trans, ret);
3300 3301
			goto out;
		}
3302
	}
	btrfs_release_path(path);

3305
out:
3306
	btrfs_free_path(path);
3307 3308 3309
	return ret;
}

/*
 * when we free a block, it is possible (and likely) that we free the last
 * delayed ref for that extent as well.  This searches the delayed ref tree for
 * a given extent, and if there are no other delayed refs to be processed, it
 * removes it from the tree.
 */
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3317
				      u64 bytenr)
3318 3319 3320
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
3321
	int ret = 0;
3322 3323 3324

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
3325
	head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3326
	if (!head)
3327
		goto out_delayed_unlock;
3328

3329
	spin_lock(&head->lock);
3330
	if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3331 3332
		goto out;

	if (cleanup_extent_op(head) != NULL)
		goto out;
3335

3336 3337 3338 3339 3340 3341 3342
	/*
	 * waiting for the lock here would deadlock.  If someone else has it
	 * locked they are already in the process of dropping it anyway
	 */
	if (!mutex_trylock(&head->mutex))
		goto out;

3343
	btrfs_delete_ref_head(delayed_refs, head);
3344
	head->processing = 0;
3345

3346
	spin_unlock(&head->lock);
3347 3348
	spin_unlock(&delayed_refs->lock);

3349 3350 3351 3352
	BUG_ON(head->extent_op);
	if (head->must_insert_reserved)
		ret = 1;

3353
	btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3354
	mutex_unlock(&head->mutex);
3355
	btrfs_put_delayed_ref_head(head);
3356
	return ret;
3357
out:
3358
	spin_unlock(&head->lock);
3359 3360

out_delayed_unlock:
3361 3362 3363 3364
	spin_unlock(&delayed_refs->lock);
	return 0;
}

3365 3366 3367
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct extent_buffer *buf,
3368
			   u64 parent, int last_ref)
3369
{
3370
	struct btrfs_fs_info *fs_info = root->fs_info;
3371
	struct btrfs_ref generic_ref = { 0 };
3372
	int pin = 1;
3373 3374
	int ret;

3375 3376 3377 3378 3379
	btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
			       buf->start, buf->len, parent);
	btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
			    root->root_key.objectid);

3380
	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
3381 3382
		int old_ref_mod, new_ref_mod;

3383
		btrfs_ref_tree_mod(fs_info, &generic_ref);
3384
		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
3385
						 &old_ref_mod, &new_ref_mod);
3386
		BUG_ON(ret); /* -ENOMEM */
3387
		pin = old_ref_mod >= 0 && new_ref_mod < 0;
3388 3389
	}

3390
	if (last_ref && btrfs_header_generation(buf) == trans->transid) {
3391 3392
		struct btrfs_block_group_cache *cache;

3393
		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
3394
			ret = check_ref_cleanup(trans, buf->start);
3395
			if (!ret)
3396
				goto out;
3397 3398
		}

3399
		pin = 0;
3400
		cache = btrfs_lookup_block_group(fs_info, buf->start);
3401

3402
		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3403
			pin_down_extent(cache, buf->start, buf->len, 1);
3404
			btrfs_put_block_group(cache);
3405
			goto out;
3406 3407 3408 3409 3410
		}

		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));

		btrfs_add_free_space(cache, buf->start, buf->len);
3411
		btrfs_free_reserved_bytes(cache, buf->len, 0);
3412
		btrfs_put_block_group(cache);
3413
		trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3414 3415
	}
out:
3416
	if (pin)
3417
		add_pinned_bytes(fs_info, &generic_ref);
3418

3419 3420 3421 3422 3423 3424 3425
	if (last_ref) {
		/*
		 * Deleting the buffer, clear the corrupt flag since it doesn't
		 * matter anymore.
		 */
		clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
	}
3426 3427
}

3428
/* Can return -ENOMEM */
3429
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3430
{
3431
	struct btrfs_fs_info *fs_info = trans->fs_info;
3432
	int old_ref_mod, new_ref_mod;
3433 3434
	int ret;

3435
	if (btrfs_is_testing(fs_info))
3436
		return 0;
3437

3438 3439 3440 3441
	/*
	 * tree log blocks never actually go into the extent allocation
	 * tree, just update pinning info and exit early.
	 */
3442 3443 3444 3445
	if ((ref->type == BTRFS_REF_METADATA &&
	     ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
	    (ref->type == BTRFS_REF_DATA &&
	     ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
3446
		/* unlocks the pinned mutex */
3447
		btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
3448
		old_ref_mod = new_ref_mod = 0;
3449
		ret = 0;
3450 3451
	} else if (ref->type == BTRFS_REF_METADATA) {
		ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
3452
						 &old_ref_mod, &new_ref_mod);
3453
	} else {
3454
		ret = btrfs_add_delayed_data_ref(trans, ref, 0,
3455
						 &old_ref_mod, &new_ref_mod);
3456
	}
3457

3458 3459 3460 3461 3462
	if (!((ref->type == BTRFS_REF_METADATA &&
	       ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
	      (ref->type == BTRFS_REF_DATA &&
	       ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
		btrfs_ref_tree_mod(fs_info, ref);
3463

3464
	if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
3465
		add_pinned_bytes(fs_info, ref);
3466

3467 3468 3469
	return ret;
}

enum btrfs_loop_type {
3471 3472 3473 3474
	LOOP_CACHING_NOWAIT,
	LOOP_CACHING_WAIT,
	LOOP_ALLOC_CHUNK,
	LOOP_NO_EMPTY_SIZE,
J
Josef Bacik 已提交
3475 3476
};

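/*
 * Small helpers used by find_free_extent(): when the allocation is for
 * delalloc we also take the block group's data_rwsem (shared), which
 * btrfs_release_block_group() drops again.
 */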
static inline void
btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
		       int delalloc)
{
	if (delalloc)
		down_read(&cache->data_rwsem);
}

static inline void
btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
		       int delalloc)
{
	btrfs_get_block_group(cache);
	if (delalloc)
		down_read(&cache->data_rwsem);
}

static struct btrfs_block_group_cache *
btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
		   struct btrfs_free_cluster *cluster,
		   int delalloc)
{
	struct btrfs_block_group_cache *used_bg = NULL;

	spin_lock(&cluster->refill_lock);
	while (1) {
		used_bg = cluster->block_group;
		if (!used_bg)
			return NULL;

		if (used_bg == block_group)
			return used_bg;

		btrfs_get_block_group(used_bg);

		if (!delalloc)
			return used_bg;

		if (down_read_trylock(&used_bg->data_rwsem))
			return used_bg;

		spin_unlock(&cluster->refill_lock);

		/* We should only have one-level nested. */
		down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);

		spin_lock(&cluster->refill_lock);
		if (used_bg == cluster->block_group)
			return used_bg;

		up_read(&used_bg->data_rwsem);
		btrfs_put_block_group(used_bg);
	}
}

static inline void
btrfs_release_block_group(struct btrfs_block_group_cache *cache,
			 int delalloc)
{
	if (delalloc)
		up_read(&cache->data_rwsem);
	btrfs_put_block_group(cache);
}

/*
 * Structure used internally for find_free_extent() function.  Wraps needed
 * parameters.
 */
struct find_free_extent_ctl {
	/* Basic allocation info */
	u64 ram_bytes;
	u64 num_bytes;
	u64 empty_size;
	u64 flags;
	int delalloc;

	/* Where to start the search inside the bg */
	u64 search_start;

	/* For clustered allocation */
	u64 empty_cluster;

	bool have_caching_bg;
	bool orig_have_caching_bg;

	/* RAID index, converted from flags */
	int index;

	/*
	 * Current loop number, check find_free_extent_update_loop() for details
	 */
	int loop;

	/*
	 * Whether we're refilling a cluster, if true we need to re-search
	 * current block group but don't try to refill the cluster again.
	 */
	bool retry_clustered;

	/*
	 * Whether we're updating free space cache, if true we need to re-search
	 * current block group but don't try updating free space cache again.
	 */
	bool retry_unclustered;

	/* If current block group is cached */
	int cached;

	/* Max contiguous hole found */
	u64 max_extent_size;

	/* Total free space from free space cache, not always contiguous */
	u64 total_free_space;

	/* Found result */
	u64 found_offset;
};

/*
 * Helper function for find_free_extent().
 *
 * Return -ENOENT to inform caller that we need fallback to unclustered mode.
 * Return -EAGAIN to inform caller that we need to re-search this block group
 * Return >0 to inform caller that we find nothing
 * Return 0 means we have found a location and set ffe_ctl->found_offset.
 */
static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
		struct btrfs_free_cluster *last_ptr,
		struct find_free_extent_ctl *ffe_ctl,
		struct btrfs_block_group_cache **cluster_bg_ret)
{
	struct btrfs_block_group_cache *cluster_bg;
	u64 aligned_cluster;
	u64 offset;
	int ret;

	cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
	if (!cluster_bg)
		goto refill_cluster;
	if (cluster_bg != bg && (cluster_bg->ro ||
	    !block_group_bits(cluster_bg, ffe_ctl->flags)))
		goto release_cluster;

	offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
			ffe_ctl->num_bytes, cluster_bg->key.objectid,
			&ffe_ctl->max_extent_size);
	if (offset) {
		/* We have a block, we're done */
		spin_unlock(&last_ptr->refill_lock);
		trace_btrfs_reserve_extent_cluster(cluster_bg,
				ffe_ctl->search_start, ffe_ctl->num_bytes);
		*cluster_bg_ret = cluster_bg;
		ffe_ctl->found_offset = offset;
		return 0;
	}
	WARN_ON(last_ptr->block_group != cluster_bg);

release_cluster:
	/*
	 * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new clusters, so
	 * lets just skip it and let the allocator find whatever block it can
	 * find. If we reach this point, we will have tried the cluster
	 * allocator plenty of times and not have found anything, so we are
	 * likely way too fragmented for the clustering stuff to find anything.
	 *
	 * However, if the cluster is taken from the current block group,
	 * release the cluster first, so that we stand a better chance of
	 * succeeding in the unclustered allocation.
	 */
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
		spin_unlock(&last_ptr->refill_lock);
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
		return -ENOENT;
	}

	/* This cluster didn't work out, free it and start over */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);

	if (cluster_bg != bg)
		btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);

refill_cluster:
	if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
		spin_unlock(&last_ptr->refill_lock);
		return -ENOENT;
	}

	aligned_cluster = max_t(u64,
			ffe_ctl->empty_cluster + ffe_ctl->empty_size,
			bg->full_stripe_len);
	ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
			ffe_ctl->num_bytes, aligned_cluster);
	if (ret == 0) {
		/* Now pull our allocation out of this cluster */
		offset = btrfs_alloc_from_cluster(bg, last_ptr,
				ffe_ctl->num_bytes, ffe_ctl->search_start,
				&ffe_ctl->max_extent_size);
		if (offset) {
			/* We found one, proceed */
			spin_unlock(&last_ptr->refill_lock);
			trace_btrfs_reserve_extent_cluster(bg,
					ffe_ctl->search_start,
					ffe_ctl->num_bytes);
			ffe_ctl->found_offset = offset;
			return 0;
		}
	} else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
		   !ffe_ctl->retry_clustered) {
		spin_unlock(&last_ptr->refill_lock);

		ffe_ctl->retry_clustered = true;
		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
				ffe_ctl->empty_cluster + ffe_ctl->empty_size);
		return -EAGAIN;
	}
	/*
	 * At this point we either didn't find a cluster or we weren't able to
	 * allocate a block from our cluster.  Free the cluster we've been
	 * trying to use, and go to the next block group.
	 */
	btrfs_return_cluster_to_free_space(NULL, last_ptr);
	spin_unlock(&last_ptr->refill_lock);
	return 1;
}

/*
 * Return >0 to inform caller that we find nothing
 * Return 0 when we found an free extent and set ffe_ctrl->found_offset
 * Return -EAGAIN to inform caller that we need to re-search this block group
 */
static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
		struct btrfs_free_cluster *last_ptr,
		struct find_free_extent_ctl *ffe_ctl)
{
	u64 offset;

	/*
	 * We are doing an unclustered allocation, set the fragmented flag so
	 * we don't bother trying to setup a cluster again until we get more
	 * space.
	 */
	if (unlikely(last_ptr)) {
		spin_lock(&last_ptr->lock);
		last_ptr->fragmented = 1;
		spin_unlock(&last_ptr->lock);
	}
	if (ffe_ctl->cached) {
		struct btrfs_free_space_ctl *free_space_ctl;

		free_space_ctl = bg->free_space_ctl;
		spin_lock(&free_space_ctl->tree_lock);
		if (free_space_ctl->free_space <
		    ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
		    ffe_ctl->empty_size) {
			ffe_ctl->total_free_space = max_t(u64,
					ffe_ctl->total_free_space,
					free_space_ctl->free_space);
			spin_unlock(&free_space_ctl->tree_lock);
			return 1;
		}
		spin_unlock(&free_space_ctl->tree_lock);
	}

	offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
			ffe_ctl->num_bytes, ffe_ctl->empty_size,
			&ffe_ctl->max_extent_size);

	/*
	 * If we didn't find a chunk, and we haven't failed on this block group
	 * before, and this block group is in the middle of caching and we are
	 * ok with waiting, then go ahead and wait for progress to be made, and
	 * set @retry_unclustered to true.
	 *
	 * If @retry_unclustered is true then we've already waited on this
	 * block group once and should move on to the next block group.
	 */
	if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
	    ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
		btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
						      ffe_ctl->empty_size);
		ffe_ctl->retry_unclustered = true;
		return -EAGAIN;
	} else if (!offset) {
		return 1;
	}
	ffe_ctl->found_offset = offset;
	return 0;
}

/*
 * Return >0 means caller needs to re-search for free extent
 * Return 0 means we have the needed free extent.
 * Return <0 means we failed to locate any free extent.
 */
static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
					struct btrfs_free_cluster *last_ptr,
					struct btrfs_key *ins,
					struct find_free_extent_ctl *ffe_ctl,
					int full_search, bool use_cluster)
{
	struct btrfs_root *root = fs_info->extent_root;
	int ret;

	if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
	    ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
		ffe_ctl->orig_have_caching_bg = true;

	if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
	    ffe_ctl->have_caching_bg)
		return 1;

	if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
		return 1;

	if (ins->objectid) {
		if (!use_cluster && last_ptr) {
			spin_lock(&last_ptr->lock);
			last_ptr->window_start = ins->objectid;
			spin_unlock(&last_ptr->lock);
		}
		return 0;
	}

	/*
	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
	 *			caching kthreads as we move along
	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
	 *		       again
	 */
	if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
		ffe_ctl->index = 0;
		if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
			/*
			 * We want to skip the LOOP_CACHING_WAIT step if we
			 * don't have any uncached bgs and we've already done a
			 * full search through.
			 */
			if (ffe_ctl->orig_have_caching_bg || !full_search)
				ffe_ctl->loop = LOOP_CACHING_WAIT;
			else
				ffe_ctl->loop = LOOP_ALLOC_CHUNK;
		} else {
			ffe_ctl->loop++;
		}

		if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
			struct btrfs_trans_handle *trans;
			int exist = 0;

			trans = current->journal_info;
			if (trans)
				exist = 1;
			else
				trans = btrfs_join_transaction(root);

			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				return ret;
			}

			ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
						CHUNK_ALLOC_FORCE);

			/*
			 * If we can't allocate a new chunk we've already looped
			 * through at least once, move on to the NO_EMPTY_SIZE
			 * case.
			 */
			if (ret == -ENOSPC)
				ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;

			/* Do not bail out on ENOSPC since we can do more. */
			if (ret < 0 && ret != -ENOSPC)
				btrfs_abort_transaction(trans, ret);
			else
				ret = 0;
			if (!exist)
				btrfs_end_transaction(trans);
			if (ret)
				return ret;
		}

		if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
			/*
			 * Don't loop again if we already have no empty_size and
			 * no empty_cluster.
			 */
			if (ffe_ctl->empty_size == 0 &&
			    ffe_ctl->empty_cluster == 0)
				return -ENOSPC;
			ffe_ctl->empty_size = 0;
			ffe_ctl->empty_cluster = 0;
		}
		return 1;
	}
	return -ENOSPC;
}

/*
 * walks the btree of allocated extents and find a hole of a given size.
 * The key ins is changed to record the hole:
 * ins->objectid == start position
 * ins->flags = BTRFS_EXTENT_ITEM_KEY
 * ins->offset == the size of the hole.
 * Any available blocks before search_start are skipped.
 *
 * If there is no suitable free space, we will record the max size of
 * the free space extent currently.
 *
 * The overall logic and call chain:
 *
 * find_free_extent()
 * |- Iterate through all block groups
 * |  |- Get a valid block group
 * |  |- Try to do clustered allocation in that block group
 * |  |- Try to do unclustered allocation in that block group
 * |  |- Check if the result is valid
 * |  |  |- If valid, then exit
 * |  |- Jump to next block group
 * |
 * |- Push harder to find free extents
 *    |- If not found, re-iterate all block groups
 */
static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
				u64 ram_bytes, u64 num_bytes, u64 empty_size,
				u64 hint_byte, struct btrfs_key *ins,
				u64 flags, int delalloc)
{
	int ret = 0;
	struct btrfs_free_cluster *last_ptr = NULL;
	struct btrfs_block_group_cache *block_group = NULL;
	struct find_free_extent_ctl ffe_ctl = {0};
	struct btrfs_space_info *space_info;
	bool use_cluster = true;
	bool full_search = false;

	WARN_ON(num_bytes < fs_info->sectorsize);

	ffe_ctl.ram_bytes = ram_bytes;
	ffe_ctl.num_bytes = num_bytes;
	ffe_ctl.empty_size = empty_size;
	ffe_ctl.flags = flags;
	ffe_ctl.search_start = 0;
	ffe_ctl.retry_clustered = false;
	ffe_ctl.retry_unclustered = false;
	ffe_ctl.delalloc = delalloc;
	ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
	ffe_ctl.have_caching_bg = false;
	ffe_ctl.orig_have_caching_bg = false;
	ffe_ctl.found_offset = 0;

	ins->type = BTRFS_EXTENT_ITEM_KEY;
	ins->objectid = 0;
	ins->offset = 0;

	trace_find_free_extent(fs_info, num_bytes, empty_size, flags);

	space_info = btrfs_find_space_info(fs_info, flags);
	if (!space_info) {
		btrfs_err(fs_info, "No space info for %llu", flags);
		return -ENOSPC;
	}

	/*
	 * If our free space is heavily fragmented we may not be able to make
	 * big contiguous allocations, so instead of doing the expensive search
	 * for free space, simply return ENOSPC with our max_extent_size so we
	 * can go ahead and search for a more manageable chunk.
	 *
	 * If our max_extent_size is large enough for our allocation simply
	 * disable clustering since we will likely not be able to find enough
	 * space to create a cluster and induce latency trying.
	 */
	if (unlikely(space_info->max_extent_size)) {
		spin_lock(&space_info->lock);
		if (space_info->max_extent_size &&
		    num_bytes > space_info->max_extent_size) {
			ins->offset = space_info->max_extent_size;
			spin_unlock(&space_info->lock);
			return -ENOSPC;
		} else if (space_info->max_extent_size) {
			use_cluster = false;
		}
		spin_unlock(&space_info->lock);
	}

	last_ptr = fetch_cluster_info(fs_info, space_info,
				      &ffe_ctl.empty_cluster);
	if (last_ptr) {
		spin_lock(&last_ptr->lock);
		if (last_ptr->block_group)
			hint_byte = last_ptr->window_start;
		if (last_ptr->fragmented) {
			/*
			 * We still set window_start so we can keep track of the
			 * last place we found an allocation to try and save
			 * some time.
			 */
			hint_byte = last_ptr->window_start;
			use_cluster = false;
		}
		spin_unlock(&last_ptr->lock);
	}

	ffe_ctl.search_start = max(ffe_ctl.search_start,
				   first_logical_byte(fs_info, 0));
	ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
	if (ffe_ctl.search_start == hint_byte) {
		block_group = btrfs_lookup_block_group(fs_info,
						       ffe_ctl.search_start);
		/*
		 * we don't want to use the block group if it doesn't match our
		 * allocation bits, or if its not cached.
		 *
		 * However if we are re-searching with an ideal block group
		 * picked out then we don't care that the block group is cached.
		 */
		if (block_group && block_group_bits(block_group, flags) &&
		    block_group->cached != BTRFS_CACHE_NO) {
			down_read(&space_info->groups_sem);
			if (list_empty(&block_group->list) ||
			    block_group->ro) {
				/*
				 * someone is removing this block group,
				 * we can't jump into the have_block_group
				 * target because our list pointers are not
				 * valid
				 */
				btrfs_put_block_group(block_group);
				up_read(&space_info->groups_sem);
			} else {
				ffe_ctl.index = btrfs_bg_flags_to_raid_index(
						block_group->flags);
				btrfs_lock_block_group(block_group, delalloc);
				goto have_block_group;
			}
		} else if (block_group) {
			btrfs_put_block_group(block_group);
		}
	}
search:
	ffe_ctl.have_caching_bg = false;
	if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
	    ffe_ctl.index == 0)
		full_search = true;
	down_read(&space_info->groups_sem);
	list_for_each_entry(block_group,
			    &space_info->block_groups[ffe_ctl.index], list) {
		/* If the block group is read-only, we can skip it entirely. */
		if (unlikely(block_group->ro))
			continue;

		btrfs_grab_block_group(block_group, delalloc);
		ffe_ctl.search_start = block_group->key.objectid;

		/*
		 * this can happen if we end up cycling through all the
		 * raid types, but we want to make sure we only allocate
		 * for the proper type.
		 */
		if (!block_group_bits(block_group, flags)) {
			u64 extra = BTRFS_BLOCK_GROUP_DUP |
				BTRFS_BLOCK_GROUP_RAID1_MASK |
				BTRFS_BLOCK_GROUP_RAID56_MASK |
				BTRFS_BLOCK_GROUP_RAID10;

			/*
			 * if they asked for extra copies and this block group
			 * doesn't provide them, bail.  This does allow us to
			 * fill raid0 from raid1.
			 */
			if ((flags & extra) && !(block_group->flags & extra))
				goto loop;

			/*
			 * This block group has different flags than we want.
			 * It's possible that we have MIXED_GROUP flag but no
			 * block group is mixed.  Just skip such block group.
			 */
			btrfs_release_block_group(block_group, delalloc);
			continue;
		}

have_block_group:
		ffe_ctl.cached = btrfs_block_group_cache_done(block_group);
		if (unlikely(!ffe_ctl.cached)) {
			ffe_ctl.have_caching_bg = true;
			ret = btrfs_cache_block_group(block_group, 0);
			BUG_ON(ret < 0);
			ret = 0;
		}

		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
			goto loop;

		/*
		 * Ok we want to try and use the cluster allocator, so
		 * lets look there
		 */
		if (last_ptr && use_cluster) {
			struct btrfs_block_group_cache *cluster_bg = NULL;

			ret = find_free_extent_clustered(block_group, last_ptr,
							 &ffe_ctl, &cluster_bg);

			if (ret == 0) {
				if (cluster_bg && cluster_bg != block_group) {
					btrfs_release_block_group(block_group,
								  delalloc);
					block_group = cluster_bg;
				}
				goto checks;
			} else if (ret == -EAGAIN) {
				goto have_block_group;
			} else if (ret > 0) {
				goto loop;
			}
			/* ret == -ENOENT case falls through */
		}

		ret = find_free_extent_unclustered(block_group, last_ptr,
						   &ffe_ctl);
		if (ret == -EAGAIN)
			goto have_block_group;
		else if (ret > 0)
			goto loop;
		/* ret == 0 case falls through */
checks:
		ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
					     fs_info->stripesize);

		/* move on to the next group */
		if (ffe_ctl.search_start + num_bytes >
		    block_group->key.objectid + block_group->key.offset) {
			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
					     num_bytes);
			goto loop;
		}

		if (ffe_ctl.found_offset < ffe_ctl.search_start)
			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
				ffe_ctl.search_start - ffe_ctl.found_offset);

		ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
				num_bytes, delalloc);
		if (ret == -EAGAIN) {
			btrfs_add_free_space(block_group, ffe_ctl.found_offset,
					     num_bytes);
			goto loop;
		}
		btrfs_inc_block_group_reservations(block_group);

		/* we are all good, lets return */
		ins->objectid = ffe_ctl.search_start;
		ins->offset = num_bytes;

		trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
					   num_bytes);
		btrfs_release_block_group(block_group, delalloc);
		break;
loop:
		ffe_ctl.retry_clustered = false;
		ffe_ctl.retry_unclustered = false;
		BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
		       ffe_ctl.index);
		btrfs_release_block_group(block_group, delalloc);
		cond_resched();
	}
	up_read(&space_info->groups_sem);

	ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
					   full_search, use_cluster);
	if (ret > 0)
		goto search;

	if (ret == -ENOSPC) {
		/*
		 * Use ffe_ctl->total_free_space as fallback if we can't find
		 * any contiguous hole.
		 */
		if (!ffe_ctl.max_extent_size)
			ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
		spin_lock(&space_info->lock);
		space_info->max_extent_size = ffe_ctl.max_extent_size;
		spin_unlock(&space_info->lock);
		ins->offset = ffe_ctl.max_extent_size;
	}
	return ret;
4169

/*
 * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
 *			  hole that is at least as big as @num_bytes.
 *
 * @root           -	The root that will contain this extent
 *
 * @ram_bytes      -	The amount of space in ram that @num_bytes take. This
 *			is used for accounting purposes. This value differs
 *			from @num_bytes only in the case of compressed extents.
 *
 * @num_bytes      -	Number of bytes to allocate on-disk.
 *
 * @min_alloc_size -	Indicates the minimum amount of space that the
 *			allocator should try to satisfy. In some cases
 *			@num_bytes may be larger than what is required and if
 *			the filesystem is fragmented then allocation fails.
 *			However, the presence of @min_alloc_size gives a
 *			chance to try and satisfy the smaller allocation.
 *
 * @empty_size     -	A hint that you plan on doing more COW. This is the
 *			size in bytes the allocator should try to find free
 *			next to the block it returns.  This is just a hint and
 *			may be ignored by the allocator.
 *
 * @hint_byte      -	Hint to the allocator to start searching above the byte
 *			address passed. It might be ignored.
 *
 * @ins            -	This key is modified to record the found hole. It will
 *			have the following values:
 *			ins->objectid == start position
 *			ins->flags = BTRFS_EXTENT_ITEM_KEY
 *			ins->offset == the size of the hole.
 *
 * @is_data        -	Boolean flag indicating whether an extent is
 *			allocated for data (true) or metadata (false)
 *
 * @delalloc       -	Boolean flag indicating whether this allocation is for
 *			delalloc or not. If 'true' data_rwsem of block groups
 *			is going to be acquired.
 *
 *
 * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
 * case -ENOSPC is returned then @ins->offset will contain the size of the
 * largest available hole the allocator managed to find.
 */
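/*
 * Illustrative call only: a metadata allocation asks for a single nodesize
 * block with no smaller fallback, e.g.
 *
 *	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
 *				   empty_size, hint, &ins, 0, 0);
 *
 * which mirrors the call made from btrfs_alloc_tree_block() below.
 */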
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
			 u64 num_bytes, u64 min_alloc_size,
			 u64 empty_size, u64 hint_byte,
			 struct btrfs_key *ins, int is_data, int delalloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	bool final_tried = num_bytes == min_alloc_size;
	u64 flags;
	int ret;

	flags = get_alloc_profile_by_root(root, is_data);
again:
	WARN_ON(num_bytes < fs_info->sectorsize);
	ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size,
			       hint_byte, ins, flags, delalloc);
	if (!ret && !is_data) {
		btrfs_dec_block_group_reservations(fs_info, ins->objectid);
	} else if (ret == -ENOSPC) {
		if (!final_tried && ins->offset) {
			num_bytes = min(num_bytes >> 1, ins->offset);
			num_bytes = round_down(num_bytes,
					       fs_info->sectorsize);
			num_bytes = max(num_bytes, min_alloc_size);
			ram_bytes = num_bytes;
			if (num_bytes == min_alloc_size)
				final_tried = true;
			goto again;
		} else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
			struct btrfs_space_info *sinfo;

			sinfo = btrfs_find_space_info(fs_info, flags);
			btrfs_err(fs_info,
				  "allocation failed flags %llu, wanted %llu",
				  flags, num_bytes);
			if (sinfo)
				btrfs_dump_space_info(fs_info, sinfo,
						      num_bytes, 1);
		}
	}

	return ret;
}

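/*
 * Common helper for the two wrappers below: either re-pin the range or hand
 * it back to the block group's free space (optionally discarding it first)
 * and release the reservation.
 */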
static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
					u64 start, u64 len,
					int pin, int delalloc)
{
	struct btrfs_block_group_cache *cache;
	int ret = 0;

	cache = btrfs_lookup_block_group(fs_info, start);
	if (!cache) {
		btrfs_err(fs_info, "Unable to find block group for %llu",
			  start);
		return -ENOSPC;
	}

	if (pin)
		pin_down_extent(cache, start, len, 1);
	else {
		if (btrfs_test_opt(fs_info, DISCARD))
			ret = btrfs_discard_extent(fs_info, start, len, NULL);
		btrfs_add_free_space(cache, start, len);
		btrfs_free_reserved_bytes(cache, len, delalloc);
		trace_btrfs_reserved_extent_free(fs_info, start, len);
	}

	btrfs_put_block_group(cache);
	return ret;
}

int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
			       u64 start, u64 len, int delalloc)
{
	return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
}

int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
				       u64 start, u64 len)
{
	return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
}

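/*
 * Insert the extent item plus its inline data backref for a newly allocated
 * data extent, then update the free space tree and block group accounting.
 */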
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int type;
	u32 size;

	if (parent > 0)
		type = BTRFS_SHARED_DATA_REF_KEY;
	else
		type = BTRFS_EXTENT_DATA_REF_KEY;

	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;
	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      ins, size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_DATA);

	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (parent > 0) {
		struct btrfs_shared_data_ref *ref;
		ref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
	} else {
		struct btrfs_extent_data_ref *ref;
		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
	}

	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_free_path(path);

	ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
	if (ret)
		return ret;

	ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
	if (ret) { /* -ENOENT, logic error */
		btrfs_err(fs_info, "update block group failed for %llu %llu",
			ins->objectid, ins->offset);
		BUG();
	}
	trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
	return ret;
}

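/*
 * Metadata counterpart of alloc_reserved_file_extent(): insert either a
 * skinny METADATA_ITEM or a full EXTENT_ITEM with tree block info for a new
 * tree block, depending on the SKINNY_METADATA incompat flag.
 */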
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_delayed_ref_node *node,
				     struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_extent_item *extent_item;
	struct btrfs_key extent_key;
	struct btrfs_tree_block_info *block_info;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_delayed_tree_ref *ref;
	u32 size = sizeof(*extent_item) + sizeof(*iref);
	u64 num_bytes;
	u64 flags = extent_op->flags_to_set;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

	ref = btrfs_delayed_node_to_tree_ref(node);

	extent_key.objectid = node->bytenr;
	if (skinny_metadata) {
		extent_key.offset = ref->level;
		extent_key.type = BTRFS_METADATA_ITEM_KEY;
		num_bytes = fs_info->nodesize;
	} else {
		extent_key.offset = node->num_bytes;
		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
		size += sizeof(*block_info);
		num_bytes = node->num_bytes;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;
	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      &extent_key, size);
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, 1);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);

	if (skinny_metadata) {
		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	} else {
		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
		btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
		btrfs_set_tree_block_level(leaf, block_info, ref->level);
		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
	}

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_SHARED_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
	} else {
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_TREE_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
	}

	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	ret = remove_from_free_space_tree(trans, extent_key.objectid,
					  num_bytes);
	if (ret)
		return ret;

	ret = btrfs_update_block_group(trans, extent_key.objectid,
				       fs_info->nodesize, 1);
	if (ret) { /* -ENOENT, logic error */
		btrfs_err(fs_info, "update block group failed for %llu %llu",
			extent_key.objectid, extent_key.offset);
		BUG();
	}

	trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
					  fs_info->nodesize);
	return ret;
}

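/*
 * Queue a delayed data ref for a freshly reserved file extent; the extent
 * item itself is inserted when the delayed refs are run.
 */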
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root, u64 owner,
				     u64 offset, u64 ram_bytes,
				     struct btrfs_key *ins)
{
	struct btrfs_ref generic_ref = { 0 };
	int ret;

	BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);

	btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
			       ins->objectid, ins->offset, 0);
	btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
	btrfs_ref_tree_mod(root->fs_info, &generic_ref);
	ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
					 ram_bytes, NULL, NULL);
	return ret;
}

/*
 * this is used by the tree logging recovery code.  It records that
 * an extent has been allocated and makes sure to clear the free
 * space cache bits as well
 */
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
				   u64 root_objectid, u64 owner, u64 offset,
				   struct btrfs_key *ins)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int ret;
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;

	/*
	 * Mixed block groups will exclude before processing the log so we only
	 * need to do the exclude dance if this fs isn't mixed.
	 */
	if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		ret = __exclude_logged_extent(fs_info, ins->objectid,
					      ins->offset);
		if (ret)
			return ret;
	}

	block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
	if (!block_group)
		return -EINVAL;

	space_info = block_group->space_info;
	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	space_info->bytes_reserved += ins->offset;
	block_group->reserved += ins->offset;
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);

	ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
					 offset, ins, 1);
	btrfs_put_block_group(block_group);
	return ret;
}

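/*
 * Set up a freshly allocated tree block: take the tree lock, zero and fill
 * in the header, and mark the buffer dirty in the appropriate extent io
 * tree (log trees use separate dirty/new bits per log transaction).
 */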
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      u64 bytenr, int level, u64 owner)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *buf;

	buf = btrfs_find_create_tree_block(fs_info, bytenr);
	if (IS_ERR(buf))
		return buf;

	/*
	 * Extra safety check in case the extent tree is corrupted and extent
	 * allocator chooses to use a tree block which is already used and
	 * locked.
	 */
	if (buf->lock_owner == current->pid) {
		btrfs_err_rl(fs_info,
"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
			buf->start, btrfs_header_owner(buf), current->pid);
		free_extent_buffer(buf);
		return ERR_PTR(-EUCLEAN);
	}

	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
	btrfs_tree_lock(buf);
	btrfs_clean_tree_block(buf);
	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);

	btrfs_set_lock_blocking_write(buf);
	set_extent_buffer_uptodate(buf);

	memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(buf, level);
	btrfs_set_header_bytenr(buf, buf->start);
	btrfs_set_header_generation(buf, trans->transid);
	btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(buf, owner);
	write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
	write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
		buf->log_index = root->log_transid % 2;
		/*
		 * we allow two log transactions at a time, use different
		 * EXTENT bit to differentiate dirty pages.
		 */
		if (buf->log_index == 0)
			set_extent_dirty(&root->dirty_log_pages, buf->start,
					buf->start + buf->len - 1, GFP_NOFS);
		else
			set_extent_new(&root->dirty_log_pages, buf->start,
					buf->start + buf->len - 1);
	} else {
		buf->log_index = -1;
		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
			 buf->start + buf->len - 1, GFP_NOFS);
	}
	trans->dirty = true;
	/* this returns a buffer locked for blocking */
	return buf;
}

/*
 * finds a free extent and does all the dirty work required for allocation
 * returns the tree buffer or an ERR_PTR on error.
 */
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     u64 parent, u64 root_objectid,
					     const struct btrfs_disk_key *key,
					     int level, u64 hint,
					     u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key ins;
	struct btrfs_block_rsv *block_rsv;
	struct extent_buffer *buf;
	struct btrfs_delayed_extent_op *extent_op;
	struct btrfs_ref generic_ref = { 0 };
	u64 flags = 0;
	int ret;
	u32 blocksize = fs_info->nodesize;
	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
	if (btrfs_is_testing(fs_info)) {
		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
					    level, root_objectid);
		if (!IS_ERR(buf))
			root->alloc_bytenr += blocksize;
		return buf;
	}
#endif

	block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
	if (IS_ERR(block_rsv))
		return ERR_CAST(block_rsv);

	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
				   empty_size, hint, &ins, 0, 0);
	if (ret)
		goto out_unuse;

	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
				    root_objectid);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_free_reserved;
	}

	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent == 0)
			parent = ins.objectid;
		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
	} else
		BUG_ON(parent > 0);

	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
		extent_op = btrfs_alloc_delayed_extent_op();
		if (!extent_op) {
			ret = -ENOMEM;
			goto out_free_buf;
		}
		if (key)
			memcpy(&extent_op->key, key, sizeof(extent_op->key));
		else
			memset(&extent_op->key, 0, sizeof(extent_op->key));
		extent_op->flags_to_set = flags;
		extent_op->update_key = skinny_metadata ? false : true;
		extent_op->update_flags = true;
		extent_op->is_data = false;
		extent_op->level = level;

		btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
				       ins.objectid, ins.offset, parent);
		generic_ref.real_root = root->root_key.objectid;
		btrfs_init_tree_ref(&generic_ref, level, root_objectid);
		btrfs_ref_tree_mod(fs_info, &generic_ref);
		ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
						 extent_op, NULL, NULL);
		if (ret)
			goto out_free_delayed;
	}
	return buf;

out_free_delayed:
	btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
	free_extent_buffer(buf);
out_free_reserved:
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
out_unuse:
	btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
	return ERR_PTR(ret);
}

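/*
 * State shared by the subtree walk helpers below (walk_down_proc,
 * do_walk_down, walk_up_proc and friends) while dropping or updating
 * backrefs for a tree.
 */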
struct walk_control {
	u64 refs[BTRFS_MAX_LEVEL];
	u64 flags[BTRFS_MAX_LEVEL];
	struct btrfs_key update_progress;
	struct btrfs_key drop_progress;
	int drop_level;
	int stage;
	int level;
	int shared_level;
	int update_ref;
	int keep_locks;
	int reada_slot;
	int reada_count;
	int restarted;
};

#define DROP_REFERENCE	1
#define UPDATE_BACKREF	2

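/*
 * Issue readahead for the children of the node we are walking, skipping
 * blocks the walk will not descend into.
 */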
static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct walk_control *wc,
				     struct btrfs_path *path)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 refs;
	u64 flags;
	u32 nritems;
	struct btrfs_key key;
	struct extent_buffer *eb;
	int ret;
	int slot;
	int nread = 0;

	if (path->slots[wc->level] < wc->reada_slot) {
		wc->reada_count = wc->reada_count * 2 / 3;
		wc->reada_count = max(wc->reada_count, 2);
	} else {
		wc->reada_count = wc->reada_count * 3 / 2;
		wc->reada_count = min_t(int, wc->reada_count,
					BTRFS_NODEPTRS_PER_BLOCK(fs_info));
	}

	eb = path->nodes[wc->level];
	nritems = btrfs_header_nritems(eb);

	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
		if (nread >= wc->reada_count)
			break;

		cond_resched();
		bytenr = btrfs_node_blockptr(eb, slot);
		generation = btrfs_node_ptr_generation(eb, slot);

		if (slot == path->slots[wc->level])
			goto reada;

		if (wc->stage == UPDATE_BACKREF &&
		    generation <= root->root_key.offset)
			continue;

		/* We don't lock the tree block, it's OK to be racy here */
		ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
					       wc->level - 1, 1, &refs,
					       &flags);
		/* We don't care about errors in readahead. */
		if (ret < 0)
			continue;
		BUG_ON(refs == 0);

		if (wc->stage == DROP_REFERENCE) {
			if (refs == 1)
				goto reada;

			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				continue;
			btrfs_node_key_to_cpu(eb, &key, slot);
			ret = btrfs_comp_cpu_keys(&key,
						  &wc->update_progress);
			if (ret < 0)
				continue;
		} else {
			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
		}
reada:
		readahead_tree_block(fs_info, bytenr);
		nread++;
	}
	wc->reada_slot = slot;
}

/*
 * helper to process tree block while walking down the tree.
 *
 * when wc->stage == UPDATE_BACKREF, this function updates
 * back refs for pointers in the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc, int lookup_info)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	int ret;

	if (wc->stage == UPDATE_BACKREF &&
	    btrfs_header_owner(eb) != root->root_key.objectid)
		return 1;

	/*
	 * when reference count of tree block is 1, it won't increase
	 * again. once full backref flag is set, we never clear it.
	 */
	if (lookup_info &&
	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
		BUG_ON(!path->locks[level]);
		ret = btrfs_lookup_extent_info(trans, fs_info,
					       eb->start, level, 1,
					       &wc->refs[level],
					       &wc->flags[level]);
		BUG_ON(ret == -ENOMEM);
		if (ret)
			return ret;
		BUG_ON(wc->refs[level] == 0);
	}

	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level] > 1)
			return 1;

		if (path->locks[level] && !wc->keep_locks) {
			btrfs_tree_unlock_rw(eb, path->locks[level]);
			path->locks[level] = 0;
		}
		return 0;
	}

	/* wc->stage == UPDATE_BACKREF */
	if (!(wc->flags[level] & flag)) {
		BUG_ON(!path->locks[level]);
		ret = btrfs_inc_ref(trans, root, eb, 1);
		BUG_ON(ret); /* -ENOMEM */
		ret = btrfs_dec_ref(trans, root, eb, 0);
		BUG_ON(ret); /* -ENOMEM */
		ret = btrfs_set_disk_extent_flags(trans, eb->start,
						  eb->len, flag,
						  btrfs_header_level(eb), 0);
		BUG_ON(ret); /* -ENOMEM */
		wc->flags[level] |= flag;
	}

	/*
	 * the block is shared by multiple trees, so it's not good to
	 * keep the tree lock
	 */
	if (path->locks[level] && level > 0) {
		btrfs_tree_unlock_rw(eb, path->locks[level]);
		path->locks[level] = 0;
	}
	return 0;
}

/*
 * This is used to verify a ref exists for this root to deal with a bug where we
 * would have a drop_progress key that hadn't been updated properly.
 */
static int check_ref_exists(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, u64 bytenr, u64 parent,
			    int level)
{
	struct btrfs_path *path;
	struct btrfs_extent_inline_ref *iref;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = lookup_extent_backref(trans, path, &iref, bytenr,
				    root->fs_info->nodesize, parent,
				    root->root_key.objectid, level, 0);
	btrfs_free_path(path);
	if (ret == -ENOENT)
		return 0;
	if (ret < 0)
		return ret;
	return 1;
}

/*
 * helper to process tree block pointer.
 *
 * when wc->stage == DROP_REFERENCE, this function checks
 * reference count of the block pointed to. if the block
 * is shared and we need update back refs for the subtree
 * rooted at the block, this function changes wc->stage to
 * UPDATE_BACKREF. if the block is shared and there is no
 * need to update back, this function drops the reference
 * to the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int *lookup_info)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 bytenr;
	u64 generation;
	u64 parent;
	struct btrfs_key key;
	struct btrfs_key first_key;
	struct btrfs_ref ref = { 0 };
	struct extent_buffer *next;
	int level = wc->level;
	int reada = 0;
	int ret = 0;
	bool need_account = false;

	generation = btrfs_node_ptr_generation(path->nodes[level],
					       path->slots[level]);
	/*
	 * if the lower level block was created before the snapshot
	 * was created, we know there is no need to update back refs
	 * for the subtree
	 */
	if (wc->stage == UPDATE_BACKREF &&
	    generation <= root->root_key.offset) {
		*lookup_info = 1;
		return 1;
	}

	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
	btrfs_node_key_to_cpu(path->nodes[level], &first_key,
			      path->slots[level]);

	next = find_extent_buffer(fs_info, bytenr);
	if (!next) {
		next = btrfs_find_create_tree_block(fs_info, bytenr);
		if (IS_ERR(next))
			return PTR_ERR(next);

		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
					       level - 1);
		reada = 1;
	}
	btrfs_tree_lock(next);
	btrfs_set_lock_blocking_write(next);

	ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
				       &wc->refs[level - 1],
				       &wc->flags[level - 1]);
	if (ret < 0)
		goto out_unlock;

	if (unlikely(wc->refs[level - 1] == 0)) {
		btrfs_err(fs_info, "Missing references.");
		ret = -EIO;
		goto out_unlock;
	}
	*lookup_info = 0;

	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level - 1] > 1) {
			need_account = true;
			if (level == 1 &&
			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				goto skip;

			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				goto skip;

			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
			if (ret < 0)
				goto skip;

			wc->stage = UPDATE_BACKREF;
			wc->shared_level = level - 1;
		}
	} else {
		if (level == 1 &&
		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
			goto skip;
	}

	if (!btrfs_buffer_uptodate(next, generation, 0)) {
		btrfs_tree_unlock(next);
		free_extent_buffer(next);
		next = NULL;
		*lookup_info = 1;
	}

	if (!next) {
		if (reada && level == 1)
			reada_walk_down(trans, root, wc, path);
		next = read_tree_block(fs_info, bytenr, generation, level - 1,
				       &first_key);
		if (IS_ERR(next)) {
			return PTR_ERR(next);
		} else if (!extent_buffer_uptodate(next)) {
			free_extent_buffer(next);
			return -EIO;
		}
		btrfs_tree_lock(next);
		btrfs_set_lock_blocking_write(next);
	}

	level--;
	ASSERT(level == btrfs_header_level(next));
	if (level != btrfs_header_level(next)) {
		btrfs_err(root->fs_info, "mismatched level");
		ret = -EIO;
		goto out_unlock;
	}
	path->nodes[level] = next;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	wc->level = level;
	if (wc->level == 1)
		wc->reada_slot = 0;
	return 0;
skip:
	wc->refs[level - 1] = 0;
	wc->flags[level - 1] = 0;
	if (wc->stage == DROP_REFERENCE) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			parent = path->nodes[level]->start;
		} else {
			ASSERT(root->root_key.objectid ==
			       btrfs_header_owner(path->nodes[level]));
			if (root->root_key.objectid !=
			    btrfs_header_owner(path->nodes[level])) {
				btrfs_err(root->fs_info,
						"mismatched block owner");
				ret = -EIO;
				goto out_unlock;
			}
			parent = 0;
		}

		/*
		 * If we had a drop_progress we need to verify the refs are set
		 * as expected.  If we find our ref then we know that from here
		 * on out everything should be correct, and we can clear the
		 * ->restarted flag.
		 */
		if (wc->restarted) {
			ret = check_ref_exists(trans, root, bytenr, parent,
					       level - 1);
			if (ret < 0)
				goto out_unlock;
			if (ret == 0)
				goto no_delete;
			ret = 0;
			wc->restarted = 0;
		}

		/*
		 * Reloc tree doesn't contribute to qgroup numbers, and we have
		 * already accounted them at merge time (replace_path),
		 * thus we could skip expensive subtree trace here.
		 */
		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
		    need_account) {
			ret = btrfs_qgroup_trace_subtree(trans, next,
							 generation, level - 1);
			if (ret) {
				btrfs_err_rl(fs_info,
					     "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
					     ret);
			}
		}

		/*
		 * We need to update the next key in our walk control so we can
		 * update the drop_progress key accordingly.  We don't care if
		 * find_next_key doesn't find a key because that means we're at
		 * the end and are going to clean up now.
		 */
		wc->drop_level = level;
		find_next_key(path, level, &wc->drop_progress);

		btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
				       fs_info->nodesize, parent);
		btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
		ret = btrfs_free_extent(trans, &ref);
		if (ret)
			goto out_unlock;
	}
no_delete:
	*lookup_info = 1;
	ret = 1;

out_unlock:
	btrfs_tree_unlock(next);
	free_extent_buffer(next);

	return ret;


/*
 * helper to process tree block while walking up the tree.
 *
 * when wc->stage == DROP_REFERENCE, this function drops
 * reference count on the block.
 *
 * when wc->stage == UPDATE_BACKREF, this function changes
 * wc->stage back to DROP_REFERENCE if we changed wc->stage
 * to UPDATE_BACKREF previously while processing the block.
 *
 * NOTE: return value 1 means we should stop walking up.
 */
static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc)
{
5118
	struct btrfs_fs_info *fs_info = root->fs_info;
5119
	int ret;
5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 parent = 0;

	if (wc->stage == UPDATE_BACKREF) {
		BUG_ON(wc->shared_level < level);
		if (level < wc->shared_level)
			goto out;

		ret = find_next_key(path, level + 1, &wc->update_progress);
		if (ret > 0)
			wc->update_ref = 0;

		wc->stage = DROP_REFERENCE;
		wc->shared_level = -1;
		path->slots[level] = 0;

		/*
		 * check reference count again if the block isn't locked.
		 * we should start walking down the tree again if reference
		 * count is one.
		 */
		if (!path->locks[level]) {
			BUG_ON(level == 0);
			btrfs_tree_lock(eb);
5145
			btrfs_set_lock_blocking_write(eb);
5146
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
5147

5148
			ret = btrfs_lookup_extent_info(trans, fs_info,
5149
						       eb->start, level, 1,
5150 5151
						       &wc->refs[level],
						       &wc->flags[level]);
5152 5153
			if (ret < 0) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
L
Liu Bo 已提交
5154
				path->locks[level] = 0;
5155 5156
				return ret;
			}
5157 5158
			BUG_ON(wc->refs[level] == 0);
			if (wc->refs[level] == 1) {
5159
				btrfs_tree_unlock_rw(eb, path->locks[level]);
L
Liu Bo 已提交
5160
				path->locks[level] = 0;
5161 5162
				return 1;
			}
Y
Yan Zheng 已提交
5163
		}
5164
	}
Y
Yan Zheng 已提交
5165

5166 5167
	/* wc->stage == DROP_REFERENCE */
	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5168

5169 5170 5171
	if (wc->refs[level] == 1) {
		if (level == 0) {
			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5172
				ret = btrfs_dec_ref(trans, root, eb, 1);
5173
			else
5174
				ret = btrfs_dec_ref(trans, root, eb, 0);
5175
			BUG_ON(ret); /* -ENOMEM */
5176 5177 5178 5179 5180
			if (is_fstree(root->root_key.objectid)) {
				ret = btrfs_qgroup_trace_leaf_items(trans, eb);
				if (ret) {
					btrfs_err_rl(fs_info,
	"error %d accounting leaf items, quota is out of sync, rescan required",
J
Jeff Mahoney 已提交
5181
					     ret);
5182
				}
5183
			}
5184
		}
5185
		/* make block locked assertion in btrfs_clean_tree_block happy */
5186 5187 5188
		if (!path->locks[level] &&
		    btrfs_header_generation(eb) == trans->transid) {
			btrfs_tree_lock(eb);
5189
			btrfs_set_lock_blocking_write(eb);
5190
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
5191
		}
5192
		btrfs_clean_tree_block(eb);
5193 5194 5195 5196 5197
	}

	if (eb == root->node) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = eb->start;
5198 5199
		else if (root->root_key.objectid != btrfs_header_owner(eb))
			goto owner_mismatch;
5200 5201 5202
	} else {
		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = path->nodes[level + 1]->start;
5203 5204 5205
		else if (root->root_key.objectid !=
			 btrfs_header_owner(path->nodes[level + 1]))
			goto owner_mismatch;
Y
Yan Zheng 已提交
5206 5207
	}

5208
	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
5209 5210 5211
out:
	wc->refs[level] = 0;
	wc->flags[level] = 0;
5212
	return 0;
5213 5214 5215 5216 5217

owner_mismatch:
	btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
		     btrfs_header_owner(eb), root->root_key.objectid);
	return -EUCLEAN;
5218 5219 5220 5221 5222 5223 5224 5225
}

static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc)
{
	int level = wc->level;
	int lookup_info = 1;
	int ret;

	while (level >= 0) {
		ret = walk_down_proc(trans, root, path, wc, lookup_info);
		if (ret > 0)
			break;

		if (level == 0)
			break;

		if (path->slots[level] >=
		    btrfs_header_nritems(path->nodes[level]))
			break;

		ret = do_walk_down(trans, root, path, wc, &lookup_info);
		if (ret > 0) {
			path->slots[level]++;
			continue;
		} else if (ret < 0)
			return ret;
		level = wc->level;
	}
	return 0;
}

static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc, int max_level)
{
	int level = wc->level;
	int ret;

	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
	while (level < max_level && path->nodes[level]) {
		wc->level = level;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			path->slots[level]++;
			return 0;
		} else {
			ret = walk_up_proc(trans, root, path, wc);
			if (ret > 0)
				return 0;
			if (ret < 0)
				return ret;

			if (path->locks[level]) {
				btrfs_tree_unlock_rw(path->nodes[level],
						     path->locks[level]);
				path->locks[level] = 0;
			}
			free_extent_buffer(path->nodes[level]);
			path->nodes[level] = NULL;
			level++;
		}
	}
	return 1;
}

/*
 * drop a subvolume tree.
 *
 * this function traverses the tree freeing any blocks that are only
 * referenced by the tree.
 *
 * when a shared tree block is found, this function decreases its
 * reference count by one. if update_ref is true, this function
 * also makes sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 *
 * If called with for_reloc == 0, may exit early with -EAGAIN
 */
int btrfs_drop_snapshot(struct btrfs_root *root,
			 struct btrfs_block_rsv *block_rsv, int update_ref,
			 int for_reloc)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root_item *root_item = &root->root_item;
	struct walk_control *wc;
	struct btrfs_key key;
	int err = 0;
	int ret;
	int level;
	bool root_dropped = false;

	btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);

	path = btrfs_alloc_path();
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		err = -ENOMEM;
		goto out;
	}

	trans = btrfs_start_transaction(tree_root, 0);
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}

	err = btrfs_run_delayed_items(trans);
	if (err)
		goto out_end_trans;

	if (block_rsv)
		trans->block_rsv = block_rsv;

	/*
	 * This will help us catch people modifying the fs tree while we're
	 * dropping it.  It is unsafe to mess with the fs tree while it's being
	 * dropped as we unlock the root node and parent nodes as we walk down
	 * the tree, assuming nothing will change.  If something does change
	 * then we'll have stale information and drop references to blocks we've
	 * already dropped.
	 */
	set_bit(BTRFS_ROOT_DELETING, &root->state);
	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
		level = btrfs_header_level(root->node);
		path->nodes[level] = btrfs_lock_root_node(root);
		btrfs_set_lock_blocking_write(path->nodes[level]);
		path->slots[level] = 0;
		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
		memset(&wc->update_progress, 0,
		       sizeof(wc->update_progress));
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
		memcpy(&wc->update_progress, &key,
		       sizeof(wc->update_progress));

		level = root_item->drop_level;
		BUG_ON(level == 0);
		path->lowest_level = level;
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			err = ret;
			goto out_end_trans;
		}
		WARN_ON(ret > 0);

		/*
		 * unlock our path, this is safe because only this
		 * function is allowed to delete this snapshot
		 */
		btrfs_unlock_up_safe(path, 0);

		level = btrfs_header_level(root->node);
		while (1) {
			btrfs_tree_lock(path->nodes[level]);
			btrfs_set_lock_blocking_write(path->nodes[level]);
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;

			ret = btrfs_lookup_extent_info(trans, fs_info,
						path->nodes[level]->start,
						level, 1, &wc->refs[level],
						&wc->flags[level]);
			if (ret < 0) {
				err = ret;
				goto out_end_trans;
			}
			BUG_ON(wc->refs[level] == 0);

			if (level == root_item->drop_level)
				break;

			btrfs_tree_unlock(path->nodes[level]);
			path->locks[level] = 0;
			WARN_ON(wc->refs[level] != 1);
			level--;
		}
	}

	wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = update_ref;
	wc->keep_locks = 0;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {

		ret = walk_down_tree(trans, root, path, wc);
		if (ret < 0) {
			err = ret;
			break;
		}

		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
		if (ret < 0) {
			err = ret;
			break;
		}

		if (ret > 0) {
			BUG_ON(wc->stage != DROP_REFERENCE);
			break;
		}

		if (wc->stage == DROP_REFERENCE) {
			wc->drop_level = wc->level;
			btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
					      &wc->drop_progress,
					      path->slots[wc->drop_level]);
		}
		btrfs_cpu_key_to_disk(&root_item->drop_progress,
				      &wc->drop_progress);
		root_item->drop_level = wc->drop_level;

		BUG_ON(wc->level == 0);
		if (btrfs_should_end_transaction(trans) ||
		    (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
			ret = btrfs_update_root(trans, tree_root,
						&root->root_key,
						root_item);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				err = ret;
				goto out_end_trans;
			}

			btrfs_end_transaction_throttle(trans);
			if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
				btrfs_debug(fs_info,
					    "drop snapshot early exit");
				err = -EAGAIN;
				goto out_free;
			}

			trans = btrfs_start_transaction(tree_root, 0);
			if (IS_ERR(trans)) {
				err = PTR_ERR(trans);
				goto out_free;
			}
			if (block_rsv)
				trans->block_rsv = block_rsv;
		}
	}
	btrfs_release_path(path);
	if (err)
		goto out_end_trans;

	ret = btrfs_del_root(trans, &root->root_key);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		err = ret;
		goto out_end_trans;
	}

	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_find_root(tree_root, &root->root_key, path,
				      NULL, NULL);
		if (ret < 0) {
			btrfs_abort_transaction(trans, ret);
			err = ret;
			goto out_end_trans;
		} else if (ret > 0) {
			/* if we fail to delete the orphan item this time
			 * around, it'll get picked up the next time.
			 *
			 * The most common failure here is just -ENOENT.
			 */
			btrfs_del_orphan_item(trans, tree_root,
					      root->root_key.objectid);
		}
	}

	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
		btrfs_add_dropped_root(trans, root);
	} else {
		free_extent_buffer(root->node);
		free_extent_buffer(root->commit_root);
		btrfs_put_fs_root(root);
	}
	root_dropped = true;
out_end_trans:
	btrfs_end_transaction_throttle(trans);
out_free:
	kfree(wc);
	btrfs_free_path(path);
out:
	/*
	 * So if we need to stop dropping the snapshot for whatever reason we
	 * need to make sure to add it back to the dead root list so that we
	 * keep trying to do the work later.  This also cleans up roots if we
	 * don't have it in the radix (like when we recover after a power fail
	 * or unmount) so we don't leak memory.
	 */
	if (!for_reloc && !root_dropped)
		btrfs_add_dead_root(root);
	if (err && err != -EAGAIN)
		btrfs_handle_fs_error(fs_info, err, NULL);
	return err;
}
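
/*
 * Illustrative sketch (not part of this file; the caller shown is an
 * assumption based on how the cleaner normally drives this function): a dead
 * subvolume taken off the dead-roots list is typically dropped like
 *
 *	ret = btrfs_drop_snapshot(root, NULL, 0, 0);
 *
 * where a return of -EAGAIN means the cleaner was asked to sleep; the root
 * is then left on the dead list (see the out: label above) and the drop is
 * resumed later from root_item->drop_progress.
 */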

/*
 * drop subtree rooted at tree block 'node'.
 *
 * NOTE: this function will unlock and release tree block 'node'
 * only used by relocation code
 */
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct extent_buffer *node,
			struct extent_buffer *parent)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct walk_control *wc;
	int level;
	int parent_level;
	int ret = 0;
	int wret;

	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

	btrfs_assert_tree_locked(parent);
	parent_level = btrfs_header_level(parent);
	extent_buffer_get(parent);
	path->nodes[parent_level] = parent;
	path->slots[parent_level] = btrfs_header_nritems(parent);

	btrfs_assert_tree_locked(node);
	level = btrfs_header_level(node);
	path->nodes[level] = node;
	path->slots[level] = 0;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;

	wc->refs[parent_level] = 1;
	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = 0;
	wc->keep_locks = 1;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);

	while (1) {
		wret = walk_down_tree(trans, root, path, wc);
		if (wret < 0) {
			ret = wret;
			break;
		}

		wret = walk_up_tree(trans, root, path, wc, parent_level);
		if (wret < 0)
			ret = wret;
		if (wret != 0)
			break;
	}

	kfree(wc);
	btrfs_free_path(path);
	return ret;
}
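
/*
 * Illustrative sketch (assumed caller, not part of this file): relocation
 * passes in a write-locked parent and a write-locked node, e.g.
 *
 *	btrfs_tree_lock(parent);
 *	btrfs_tree_lock(node);
 *	ret = btrfs_drop_subtree(trans, root, node, parent);
 *
 * 'node' is unlocked and released by the callee (see the NOTE above), while
 * 'parent' is left to the caller.
 */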

/*
 * helper to account the unused space of all the readonly block groups in the
 * space_info. takes mirrors into account.
 */
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
	struct btrfs_block_group_cache *block_group;
	u64 free_bytes = 0;
	int factor;

	/* It's df, we don't care if it's racy */
	if (list_empty(&sinfo->ro_bgs))
		return 0;

	spin_lock(&sinfo->lock);
	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
		spin_lock(&block_group->lock);

		if (!block_group->ro) {
			spin_unlock(&block_group->lock);
			continue;
		}

		factor = btrfs_bg_type_to_factor(block_group->flags);
		free_bytes += (block_group->key.offset -
			       btrfs_block_group_used(&block_group->item)) *
			       factor;

		spin_unlock(&block_group->lock);
	}
	spin_unlock(&sinfo->lock);

	return free_bytes;
}
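
/*
 * Illustrative example (numbers made up): for a read-only RAID1 block group
 * with key.offset == 1GiB and 256MiB used, btrfs_bg_type_to_factor() returns
 * 2, so that group contributes (1GiB - 256MiB) * 2 = 1.5GiB of raw free space
 * to the value returned above.
 */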

void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
	struct btrfs_block_group_cache *block_group;
	u64 last = 0;

	while (1) {
		struct inode *inode;

		block_group = btrfs_lookup_first_block_group(info, last);
		while (block_group) {
			btrfs_wait_block_group_cache_done(block_group);
			spin_lock(&block_group->lock);
			if (block_group->iref)
				break;
			spin_unlock(&block_group->lock);
			block_group = btrfs_next_block_group(block_group);
		}
		if (!block_group) {
			if (last == 0)
				break;
			last = 0;
			continue;
		}

		inode = block_group->inode;
		block_group->iref = 0;
		block_group->inode = NULL;
		spin_unlock(&block_group->lock);
		ASSERT(block_group->io_ctl.inode == NULL);
		iput(inode);
		last = block_group->key.objectid + block_group->key.offset;
		btrfs_put_block_group(block_group);
	}
}

/*
 * Must be called only after stopping all workers, since we could have block
 * group caching kthreads running, and therefore they could race with us if we
 * freed the block groups before stopping them.
 */
int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_caching_control *caching_ctl;
	struct rb_node *n;

	down_write(&info->commit_root_sem);
	while (!list_empty(&info->caching_block_groups)) {
		caching_ctl = list_entry(info->caching_block_groups.next,
					 struct btrfs_caching_control, list);
		list_del(&caching_ctl->list);
		btrfs_put_caching_control(caching_ctl);
	}
	up_write(&info->commit_root_sem);

	spin_lock(&info->unused_bgs_lock);
	while (!list_empty(&info->unused_bgs)) {
		block_group = list_first_entry(&info->unused_bgs,
					       struct btrfs_block_group_cache,
					       bg_list);
		list_del_init(&block_group->bg_list);
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&info->unused_bgs_lock);

	spin_lock(&info->block_group_cache_lock);
	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
		block_group = rb_entry(n, struct btrfs_block_group_cache,
				       cache_node);
		rb_erase(&block_group->cache_node,
			 &info->block_group_cache_tree);
		RB_CLEAR_NODE(&block_group->cache_node);
		spin_unlock(&info->block_group_cache_lock);

		down_write(&block_group->space_info->groups_sem);
		list_del(&block_group->list);
		up_write(&block_group->space_info->groups_sem);

		/*
		 * We haven't cached this block group, which means we could
		 * possibly have excluded extents on this block group.
		 */
		if (block_group->cached == BTRFS_CACHE_NO ||
		    block_group->cached == BTRFS_CACHE_ERROR)
			btrfs_free_excluded_extents(block_group);

		btrfs_remove_free_space_cache(block_group);
		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
		ASSERT(list_empty(&block_group->dirty_list));
		ASSERT(list_empty(&block_group->io_list));
		ASSERT(list_empty(&block_group->bg_list));
		ASSERT(atomic_read(&block_group->count) == 1);
		btrfs_put_block_group(block_group);

		spin_lock(&info->block_group_cache_lock);
	}
	spin_unlock(&info->block_group_cache_lock);

	/* now that all the block groups are freed, go through and
	 * free all the space_info structs.  This is only called during
	 * the final stages of unmount, and so we know nobody is
	 * using them.  We call synchronize_rcu() once before we start,
	 * just to be on the safe side.
	 */
	synchronize_rcu();

	btrfs_release_global_block_rsv(info);

	while (!list_empty(&info->space_info)) {
		space_info = list_entry(info->space_info.next,
					struct btrfs_space_info,
					list);

		/*
		 * Do not hide this behind enospc_debug, this is actually
		 * important and indicates a real bug if this happens.
		 */
		if (WARN_ON(space_info->bytes_pinned > 0 ||
			    space_info->bytes_reserved > 0 ||
			    space_info->bytes_may_use > 0))
			btrfs_dump_space_info(info, space_info, 0, 0);
		list_del(&space_info->list);
		btrfs_sysfs_remove_space_info(space_info);
	}
	return 0;
}

int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
				   u64 start, u64 end)
{
	return unpin_extent_range(fs_info, start, end, false);
}

/*
 * It used to be that old block groups would be left around forever.
 * Iterating over them would be enough to trim unused space.  Since we
 * now automatically remove them, we also need to iterate over unallocated
 * space.
 *
 * We don't want a transaction for this since the discard may take a
 * substantial amount of time.  We don't require that a transaction be
 * running, but we do need to take a running transaction into account
 * to ensure that we're not discarding chunks that were released or
 * allocated in the current transaction.
 *
 * Holding the chunks lock will prevent other threads from allocating
 * or releasing chunks, but it won't prevent a running transaction
 * from committing and releasing the memory that the pending chunks
 * list head uses.  For that, we need to take a reference to the
 * transaction and hold the commit root sem.  We only need to hold
 * it while performing the free space search since we have already
 * held back allocations.
 */
static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
{
	u64 start = SZ_1M, len = 0, end = 0;
	int ret;

	*trimmed = 0;

	/* Discard not supported = nothing to do. */
	if (!blk_queue_discard(bdev_get_queue(device->bdev)))
		return 0;

	/* Not writable = nothing to do. */
	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
		return 0;

	/* No free space = nothing to do. */
	if (device->total_bytes <= device->bytes_used)
		return 0;

	ret = 0;

	while (1) {
		struct btrfs_fs_info *fs_info = device->fs_info;
		u64 bytes;

		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
		if (ret)
			break;

		find_first_clear_extent_bit(&device->alloc_state, start,
					    &start, &end,
					    CHUNK_TRIMMED | CHUNK_ALLOCATED);

		/* Ensure we skip the reserved area in the first 1M */
		start = max_t(u64, start, SZ_1M);

		/*
		 * If find_first_clear_extent_bit finds a range that spans the
		 * end of the device it will set end to -1, in this case it's up
		 * to the caller to trim the value to the size of the device.
		 */
		end = min(end, device->total_bytes - 1);

		len = end - start + 1;

		/* We didn't find any extents */
		if (!len) {
			mutex_unlock(&fs_info->chunk_mutex);
			ret = 0;
			break;
		}

		ret = btrfs_issue_discard(device->bdev, start, len,
					  &bytes);
		if (!ret)
			set_extent_bits(&device->alloc_state, start,
					start + bytes - 1,
					CHUNK_TRIMMED);
		mutex_unlock(&fs_info->chunk_mutex);

		if (ret)
			break;

		start += len;
		*trimmed += bytes;

		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	return ret;
}
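
/*
 * Illustrative walk-through (offsets assumed, not taken from this file): with
 * chunks allocated at [1MiB, 257MiB) and the rest of the device unallocated,
 * the first pass finds the clear range starting at 257MiB, discards it, and
 * marks it CHUNK_TRIMMED in device->alloc_state so that a repeated FITRIM run
 * skips those bytes instead of discarding them again.
 */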

/*
 * Trim the whole filesystem by:
 * 1) trimming the free space in each block group
 * 2) trimming the unallocated space on each device
 *
 * This will also continue trimming even if a block group or device encounters
 * an error.  The return value will be the last error, or 0 if nothing bad
 * happens.
 */
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
	struct btrfs_block_group_cache *cache = NULL;
	struct btrfs_device *device;
	struct list_head *devices;
	u64 group_trimmed;
	u64 range_end = U64_MAX;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 bg_failed = 0;
	u64 dev_failed = 0;
	int bg_ret = 0;
	int dev_ret = 0;
	int ret = 0;

	/*
	 * Check range overflow if range->len is set.
	 * The default range->len is U64_MAX.
	 */
	if (range->len != U64_MAX &&
	    check_add_overflow(range->start, range->len, &range_end))
		return -EINVAL;

	cache = btrfs_lookup_first_block_group(fs_info, range->start);
	for (; cache; cache = btrfs_next_block_group(cache)) {
		if (cache->key.objectid >= range_end) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->key.objectid);
		end = min(range_end, cache->key.objectid + cache->key.offset);

		if (end - start >= range->minlen) {
			if (!btrfs_block_group_cache_done(cache)) {
				ret = btrfs_cache_block_group(cache, 0);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
				ret = btrfs_wait_block_group_cache_done(cache);
				if (ret) {
					bg_failed++;
					bg_ret = ret;
					continue;
				}
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				bg_failed++;
				bg_ret = ret;
				continue;
			}
		}
	}

	if (bg_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu block group(s), last error %d",
			bg_failed, bg_ret);
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	devices = &fs_info->fs_devices->devices;
	list_for_each_entry(device, devices, dev_list) {
		ret = btrfs_trim_free_extents(device, &group_trimmed);
		if (ret) {
			dev_failed++;
			dev_ret = ret;
			break;
		}

		trimmed += group_trimmed;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	if (dev_failed)
		btrfs_warn(fs_info,
			"failed to trim %llu device(s), last error %d",
			dev_failed, dev_ret);
	range->len = trimmed;
	if (bg_ret)
		return bg_ret;
	return dev_ret;
}
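
/*
 * Illustrative sketch of the consumer side (assumed, not part of this file):
 * the FITRIM ioctl handler fills in a struct fstrim_range and calls this
 * function, which from user space looks roughly like
 *
 *	struct fstrim_range range = { .start = 0, .len = (__u64)-1, .minlen = 0 };
 *
 *	ioctl(fd, FITRIM, &range);
 *
 * On return, range.len is expected to hold the number of bytes trimmed,
 * mirroring the range->len = trimmed assignment above (copying the struct
 * back to user space is left to the ioctl handler, which is assumed here).
 */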

/*
 * btrfs_{start,end}_write_no_snapshotting() are similar to
 * mnt_{want,drop}_write(). They are used to prevent some tasks from writing
 * data into the page cache through nocow before the subvolume is snapshotted
 * (and only flushing that data to disk after the snapshot has been created),
 * or to prevent operations while snapshotting is ongoing that would cause the
 * snapshot to be inconsistent (writes followed by expanding truncates for
 * example).
 */
void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
{
	percpu_counter_dec(&root->subv_writers->counter);
	cond_wake_up(&root->subv_writers->wait);
}

int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
{
	if (atomic_read(&root->will_be_snapshotted))
		return 0;

	percpu_counter_inc(&root->subv_writers->counter);
	/*
	 * Make sure counter is updated before we check for snapshot creation.
	 */
	smp_mb();
	if (atomic_read(&root->will_be_snapshotted)) {
		btrfs_end_write_no_snapshotting(root);
		return 0;
	}
	return 1;
}
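
/*
 * Illustrative sketch of the intended calling pattern (simplified; the real
 * callers live in the file/inode write paths and are assumed here):
 *
 *	if (!btrfs_start_write_no_snapshotting(root))
 *		return -EAGAIN;	(a snapshot is pending; the caller can fall
 *				 back to the cow path or retry later)
 *	... perform the nocow write ...
 *	btrfs_end_write_no_snapshotting(root);
 */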

void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
{
	while (true) {
		int ret;

		ret = btrfs_start_write_no_snapshotting(root);
		if (ret)
			break;
		wait_var_event(&root->will_be_snapshotted,
			       !atomic_read(&root->will_be_snapshotted));
	}
}