/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/sort.h>
#include <linux/rcupdate.h>
#include <linux/kthread.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/percpu_counter.h>
#include "hash.h"
#include "tree-log.h"
#include "disk-io.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "locking.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "math.h"
#include "sysfs.h"
#include "qgroup.h"

#undef SCRAMBLE_DELAYED_REFS

/*
 * control flags for do_chunk_alloc's force field
 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
 * if we really need one.
 *
 * CHUNK_ALLOC_LIMITED means to only try to allocate one
 * if we have very few chunks already allocated.  This is
 * used as part of the clustering code to help make sure
 * we have a good pool of storage to cluster in, without
 * filling the FS with empty chunks.
 *
 * CHUNK_ALLOC_FORCE means it must try to allocate one
 *
 */
enum {
	CHUNK_ALLOC_NO_FORCE = 0,
	CHUNK_ALLOC_LIMITED = 1,
	CHUNK_ALLOC_FORCE = 2,
};
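
/*
 * Illustrative example (not taken from the original source): a caller that
 * only wants a new chunk when space is genuinely short would pass
 * CHUNK_ALLOC_NO_FORCE, roughly like
 *
 *	ret = do_chunk_alloc(trans, fs_info->extent_root,
 *			     BTRFS_BLOCK_GROUP_DATA, CHUNK_ALLOC_NO_FORCE);
 *
 * while paths that must end up with a usable chunk pass CHUNK_ALLOC_FORCE.
 */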

static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 bytenr,
			      u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_delayed_ref_node *node, u64 parent,
				u64 root_objectid, u64 owner_objectid,
				u64 owner_offset, int refs_to_drop,
				struct btrfs_delayed_extent_op *extra_op);
static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
			  struct btrfs_root *extent_root, u64 flags,
			  int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups);
static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
				    u64 ram_bytes, u64 num_bytes, int delalloc);
static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
				     u64 num_bytes, int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
		     u64 bytenr, u64 num_bytes, int reserved);
static int __reserve_metadata_bytes(struct btrfs_root *root,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush);
static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes);
static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes);

static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	smp_mb();
	return cache->cached == BTRFS_CACHE_FINISHED ||
		cache->cached == BTRFS_CACHE_ERROR;
}

static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
{
	return (cache->flags & bits) == bits;
}

void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
{
	atomic_inc(&cache->count);
}

void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
{
	if (atomic_dec_and_test(&cache->count)) {
		WARN_ON(cache->pinned > 0);
		WARN_ON(cache->reserved > 0);
		kfree(cache->free_space_ctl);
		kfree(cache);
	}
}

/*
 * this adds the block group to the fs_info rb tree for the block group
 * cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
				struct btrfs_block_group_cache *block_group)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	struct btrfs_block_group_cache *cache;

	spin_lock(&info->block_group_cache_lock);
	p = &info->block_group_cache_tree.rb_node;

	while (*p) {
		parent = *p;
		cache = rb_entry(parent, struct btrfs_block_group_cache,
				 cache_node);
		if (block_group->key.objectid < cache->key.objectid) {
			p = &(*p)->rb_left;
		} else if (block_group->key.objectid > cache->key.objectid) {
			p = &(*p)->rb_right;
		} else {
			spin_unlock(&info->block_group_cache_lock);
			return -EEXIST;
		}
	}

	rb_link_node(&block_group->cache_node, parent, p);
	rb_insert_color(&block_group->cache_node,
			&info->block_group_cache_tree);

	if (info->first_logical_byte > block_group->key.objectid)
		info->first_logical_byte = block_group->key.objectid;

	spin_unlock(&info->block_group_cache_lock);

	return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group_cache *
block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
			      int contains)
{
	struct btrfs_block_group_cache *cache, *ret = NULL;
	struct rb_node *n;
	u64 end, start;

	spin_lock(&info->block_group_cache_lock);
	n = info->block_group_cache_tree.rb_node;

	while (n) {
		cache = rb_entry(n, struct btrfs_block_group_cache,
				 cache_node);
		end = cache->key.objectid + cache->key.offset - 1;
		start = cache->key.objectid;

		if (bytenr < start) {
			if (!contains && (!ret || start < ret->key.objectid))
				ret = cache;
			n = n->rb_left;
		} else if (bytenr > start) {
			if (contains && bytenr <= end) {
				ret = cache;
				break;
			}
			n = n->rb_right;
		} else {
			ret = cache;
			break;
		}
	}
	if (ret) {
		btrfs_get_block_group(ret);
		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
			info->first_logical_byte = ret->key.objectid;
	}
	spin_unlock(&info->block_group_cache_lock);

	return ret;
}

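/*
 * Mark a byte range (for example a super block stripe) as excluded from
 * free space accounting by flagging it EXTENT_UPTODATE in both
 * freed_extents trees; free_excluded_extents() below clears the flags
 * again once the block group has been cached.
 */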
static int add_excluded_extent(struct btrfs_root *root,
			       u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
	set_extent_bits(&root->fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE);
	set_extent_bits(&root->fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE);
	return 0;
}

static void free_excluded_extents(struct btrfs_root *root,
				  struct btrfs_block_group_cache *cache)
{
	u64 start, end;

	start = cache->key.objectid;
	end = start + cache->key.offset - 1;

	clear_extent_bits(&root->fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE);
	clear_extent_bits(&root->fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE);
}

static int exclude_super_stripes(struct btrfs_root *root,
				 struct btrfs_block_group_cache *cache)
{
	u64 bytenr;
	u64 *logical;
	int stripe_len;
	int i, nr, ret;

	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
		cache->bytes_super += stripe_len;
		ret = add_excluded_extent(root, cache->key.objectid,
					  stripe_len);
		if (ret)
			return ret;
	}

	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
				       cache->key.objectid, bytenr,
				       0, &logical, &nr, &stripe_len);
		if (ret)
			return ret;

		while (nr--) {
			u64 start, len;

			if (logical[nr] > cache->key.objectid +
			    cache->key.offset)
				continue;

			if (logical[nr] + stripe_len <= cache->key.objectid)
				continue;

			start = logical[nr];
			if (start < cache->key.objectid) {
				start = cache->key.objectid;
				len = (logical[nr] + stripe_len) - start;
			} else {
				len = min_t(u64, stripe_len,
					    cache->key.objectid +
					    cache->key.offset - start);
			}

			cache->bytes_super += len;
			ret = add_excluded_extent(root, start, len);
			if (ret) {
				kfree(logical);
				return ret;
			}
		}

		kfree(logical);
	}
	return 0;
}

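/*
 * Take a reference on the block group's caching control structure, or
 * return NULL if no caching is in progress.  Callers must drop the
 * reference with put_caching_control().
 */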
static struct btrfs_caching_control *
get_caching_control(struct btrfs_block_group_cache *cache)
{
	struct btrfs_caching_control *ctl;

	spin_lock(&cache->lock);
	if (!cache->caching_ctl) {
		spin_unlock(&cache->lock);
		return NULL;
	}

	ctl = cache->caching_ctl;
	atomic_inc(&ctl->count);
	spin_unlock(&cache->lock);
	return ctl;
}

static void put_caching_control(struct btrfs_caching_control *ctl)
{
	if (atomic_dec_and_test(&ctl->count))
		kfree(ctl);
}

#ifdef CONFIG_BTRFS_DEBUG
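/*
 * Debug-only helper: deliberately fragment a block group's free space by
 * removing every other chunk-sized range, so that allocator behaviour on
 * fragmented space can be exercised.
 */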
static void fragment_free_space(struct btrfs_root *root,
				struct btrfs_block_group_cache *block_group)
{
	u64 start = block_group->key.objectid;
	u64 len = block_group->key.offset;
	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
		root->nodesize : root->sectorsize;
	u64 step = chunk << 1;

	while (len > chunk) {
		btrfs_remove_free_space(block_group, start, chunk);
		start += step;
		if (len < step)
			len = 0;
		else
			len -= step;
	}
}
#endif

/*
 * this is only called by cache_block_group; since we could have freed extents,
 * we need to check the pinned_extents for any extents that can't be used yet,
 * since their free space will be released as soon as the transaction commits.
 */
u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
		       struct btrfs_fs_info *info, u64 start, u64 end)
{
	u64 extent_start, extent_end, size, total_added = 0;
	int ret;

	while (start < end) {
		ret = find_first_extent_bit(info->pinned_extents, start,
					    &extent_start, &extent_end,
					    EXTENT_DIRTY | EXTENT_UPTODATE,
					    NULL);
		if (ret)
			break;

		if (extent_start <= start) {
			start = extent_end + 1;
		} else if (extent_start > start && extent_start < end) {
			size = extent_start - start;
			total_added += size;
			ret = btrfs_add_free_space(block_group, start,
						   size);
			BUG_ON(ret); /* -ENOMEM or logic error */
			start = extent_end + 1;
		} else {
			break;
		}
	}

	if (start < end) {
		size = end - start;
		total_added += size;
		ret = btrfs_add_free_space(block_group, start, size);
		BUG_ON(ret); /* -ENOMEM or logic error */
	}

	return total_added;
}

static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_fs_info *fs_info;
	struct btrfs_root *extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u64 total_found = 0;
	u64 last = 0;
	u32 nritems;
	int ret;
	bool wakeup = true;

	block_group = caching_ctl->block_group;
	fs_info = block_group->fs_info;
	extent_root = fs_info->extent_root;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);

#ifdef CONFIG_BTRFS_DEBUG
	/*
	 * If we're fragmenting we don't want to make anybody think we can
	 * allocate from this block group until we've had a chance to fragment
	 * the free space.
	 */
	if (btrfs_should_fragment_free_space(extent_root, block_group))
		wakeup = false;
#endif
	/*
	 * We don't want to deadlock with somebody trying to allocate a new
	 * extent for the extent root while also trying to search the extent
	 * root to add free space.  So we skip locking and search the commit
	 * root, since it's read-only.
	 */
	path->skip_locking = 1;
	path->search_commit_root = 1;
	path->reada = READA_FORWARD;

	key.objectid = last;
	key.offset = 0;
	key.type = BTRFS_EXTENT_ITEM_KEY;

next:
	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);

	while (1) {
		if (btrfs_fs_closing(fs_info) > 1) {
			last = (u64)-1;
			break;
		}

		if (path->slots[0] < nritems) {
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		} else {
			ret = find_next_key(path, 0, &key);
			if (ret)
				break;

			if (need_resched() ||
			    rwsem_is_contended(&fs_info->commit_root_sem)) {
				if (wakeup)
					caching_ctl->progress = last;
				btrfs_release_path(path);
				up_read(&fs_info->commit_root_sem);
				mutex_unlock(&caching_ctl->mutex);
				cond_resched();
				mutex_lock(&caching_ctl->mutex);
				down_read(&fs_info->commit_root_sem);
				goto next;
			}

			ret = btrfs_next_leaf(extent_root, path);
			if (ret < 0)
				goto out;
			if (ret)
				break;
			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			continue;
		}

		if (key.objectid < last) {
			key.objectid = last;
			key.offset = 0;
			key.type = BTRFS_EXTENT_ITEM_KEY;

			if (wakeup)
				caching_ctl->progress = last;
			btrfs_release_path(path);
			goto next;
		}

		if (key.objectid < block_group->key.objectid) {
			path->slots[0]++;
			continue;
		}

		if (key.objectid >= block_group->key.objectid +
		    block_group->key.offset)
			break;

		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
		    key.type == BTRFS_METADATA_ITEM_KEY) {
			total_found += add_new_free_space(block_group,
							  fs_info, last,
							  key.objectid);
			if (key.type == BTRFS_METADATA_ITEM_KEY)
				last = key.objectid +
					fs_info->tree_root->nodesize;
			else
				last = key.objectid + key.offset;

			if (total_found > CACHING_CTL_WAKE_UP) {
				total_found = 0;
				if (wakeup)
					wake_up(&caching_ctl->wait);
			}
		}
		path->slots[0]++;
	}
	ret = 0;

	total_found += add_new_free_space(block_group, fs_info, last,
					  block_group->key.objectid +
					  block_group->key.offset);
	caching_ctl->progress = (u64)-1;

out:
	btrfs_free_path(path);
	return ret;
}

static noinline void caching_thread(struct btrfs_work *work)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_fs_info *fs_info;
	struct btrfs_caching_control *caching_ctl;
	struct btrfs_root *extent_root;
	int ret;

	caching_ctl = container_of(work, struct btrfs_caching_control, work);
	block_group = caching_ctl->block_group;
	fs_info = block_group->fs_info;
	extent_root = fs_info->extent_root;

	mutex_lock(&caching_ctl->mutex);
	down_read(&fs_info->commit_root_sem);

	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
		ret = load_free_space_tree(caching_ctl);
	else
		ret = load_extent_tree_free(caching_ctl);

	spin_lock(&block_group->lock);
	block_group->caching_ctl = NULL;
	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
	spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(extent_root, block_group)) {
		u64 bytes_used;

		spin_lock(&block_group->space_info->lock);
		spin_lock(&block_group->lock);
		bytes_used = block_group->key.offset -
			btrfs_block_group_used(&block_group->item);
		block_group->space_info->bytes_used += bytes_used >> 1;
		spin_unlock(&block_group->lock);
		spin_unlock(&block_group->space_info->lock);
		fragment_free_space(extent_root, block_group);
	}
#endif

	caching_ctl->progress = (u64)-1;

	up_read(&fs_info->commit_root_sem);
	free_excluded_extents(fs_info->extent_root, block_group);
	mutex_unlock(&caching_ctl->mutex);

	wake_up(&caching_ctl->wait);

	put_caching_control(caching_ctl);
	btrfs_put_block_group(block_group);
}

static int cache_block_group(struct btrfs_block_group_cache *cache,
			     int load_cache_only)
{
	DEFINE_WAIT(wait);
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
	if (!caching_ctl)
		return -ENOMEM;

	INIT_LIST_HEAD(&caching_ctl->list);
	mutex_init(&caching_ctl->mutex);
	init_waitqueue_head(&caching_ctl->wait);
	caching_ctl->block_group = cache;
	caching_ctl->progress = cache->key.objectid;
	atomic_set(&caching_ctl->count, 1);
	btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
			caching_thread, NULL, NULL);

	spin_lock(&cache->lock);
	/*
	 * This should be a rare occasion, but this could happen I think in the
	 * case where one thread starts to load the space cache info, and then
	 * some other thread starts a transaction commit which tries to do an
	 * allocation while the other thread is still loading the space cache
	 * info.  The previous loop should have kept us from choosing this block
	 * group, but if we've moved to the state where we will wait on caching
	 * block groups we need to first check if we're doing a fast load here,
	 * so we can wait for it to finish, otherwise we could end up allocating
	 * from a block group who's cache gets evicted for one reason or
	 * another.
	 */
	while (cache->cached == BTRFS_CACHE_FAST) {
		struct btrfs_caching_control *ctl;

		ctl = cache->caching_ctl;
		atomic_inc(&ctl->count);
		prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
		spin_unlock(&cache->lock);

		schedule();

		finish_wait(&ctl->wait, &wait);
		put_caching_control(ctl);
		spin_lock(&cache->lock);
	}

	if (cache->cached != BTRFS_CACHE_NO) {
		spin_unlock(&cache->lock);
		kfree(caching_ctl);
		return 0;
	}
	WARN_ON(cache->caching_ctl);
	cache->caching_ctl = caching_ctl;
	cache->cached = BTRFS_CACHE_FAST;
	spin_unlock(&cache->lock);

	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
		mutex_lock(&caching_ctl->mutex);
		ret = load_free_space_cache(fs_info, cache);

		spin_lock(&cache->lock);
		if (ret == 1) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_FINISHED;
			cache->last_byte_to_unpin = (u64)-1;
			caching_ctl->progress = (u64)-1;
		} else {
			if (load_cache_only) {
				cache->caching_ctl = NULL;
				cache->cached = BTRFS_CACHE_NO;
			} else {
				cache->cached = BTRFS_CACHE_STARTED;
				cache->has_caching_ctl = 1;
			}
		}
		spin_unlock(&cache->lock);
#ifdef CONFIG_BTRFS_DEBUG
		if (ret == 1 &&
		    btrfs_should_fragment_free_space(fs_info->extent_root,
						     cache)) {
			u64 bytes_used;

			spin_lock(&cache->space_info->lock);
			spin_lock(&cache->lock);
			bytes_used = cache->key.offset -
				btrfs_block_group_used(&cache->item);
			cache->space_info->bytes_used += bytes_used >> 1;
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);
			fragment_free_space(fs_info->extent_root, cache);
		}
#endif
		mutex_unlock(&caching_ctl->mutex);

		wake_up(&caching_ctl->wait);
		if (ret == 1) {
			put_caching_control(caching_ctl);
			free_excluded_extents(fs_info->extent_root, cache);
			return 0;
		}
	} else {
		/*
		 * We're either using the free space tree or no caching at all.
		 * Set cached to the appropriate value and wake up any waiters.
		 */
		spin_lock(&cache->lock);
		if (load_cache_only) {
			cache->caching_ctl = NULL;
			cache->cached = BTRFS_CACHE_NO;
		} else {
			cache->cached = BTRFS_CACHE_STARTED;
			cache->has_caching_ctl = 1;
		}
		spin_unlock(&cache->lock);
		wake_up(&caching_ctl->wait);
	}

	if (load_cache_only) {
		put_caching_control(caching_ctl);
		return 0;
	}

	down_write(&fs_info->commit_root_sem);
	atomic_inc(&caching_ctl->count);
	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
	up_write(&fs_info->commit_root_sem);

	btrfs_get_block_group(cache);

	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);

	return ret;
}

/*
 * return the block group that starts at or after bytenr
 */
static struct btrfs_block_group_cache *
btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 0);

	return cache;
}

/*
 * return the block group that contains the given bytenr
 */
struct btrfs_block_group_cache *btrfs_lookup_block_group(
						 struct btrfs_fs_info *info,
						 u64 bytenr)
{
	struct btrfs_block_group_cache *cache;

	cache = block_group_cache_tree_search(info, bytenr, 1);

	return cache;
}

static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
						  u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

/* simple helper to search for an existing data extent at a given offset */
int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
				0, 0);
	btrfs_free_path(path);
	return ret;
}

/*
 * helper function to lookup reference count and flags of a tree block.
 *
 * the head node for delayed ref is used to store the sum of all the
 * reference count modifications queued up in the rbtree. the head
 * node may also store the extent flags to set. This way you can check
 * to see what the reference count and extent flags would be if all of
 * the delayed refs are not processed.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, u64 bytenr,
819
			     u64 offset, int metadata, u64 *refs, u64 *flags)
820 821 822 823 824 825 826 827 828 829 830 831
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	u32 item_size;
	u64 num_refs;
	u64 extent_flags;
	int ret;

	/*
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
		offset = root->nodesize;
		metadata = 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (!trans) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

search_again:
	key.objectid = bytenr;
	key.offset = offset;
	if (metadata)
		key.type = BTRFS_METADATA_ITEM_KEY;
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
				&key, path, 0, 0);
	if (ret < 0)
		goto out_free;

	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == root->nodesize)
				ret = 0;
		}
	}

	if (ret == 0) {
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		if (item_size >= sizeof(*ei)) {
			ei = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_extent_item);
			num_refs = btrfs_extent_refs(leaf, ei);
			extent_flags = btrfs_extent_flags(leaf, ei);
		} else {
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
			struct btrfs_extent_item_v0 *ei0;
			BUG_ON(item_size != sizeof(*ei0));
			ei0 = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_item_v0);
			num_refs = btrfs_extent_refs_v0(leaf, ei0);
			/* FIXME: this isn't correct for data */
			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
#else
			BUG();
#endif
		}
		BUG_ON(num_refs == 0);
	} else {
		num_refs = 0;
		extent_flags = 0;
		ret = 0;
	}

	if (!trans)
		goto out;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(trans, bytenr);
	if (head) {
		if (!mutex_trylock(&head->mutex)) {
			atomic_inc(&head->node.refs);
			spin_unlock(&delayed_refs->lock);

			btrfs_release_path(path);

			/*
			 * Mutex was contended, block until it's released and try
			 * again
			 */
			mutex_lock(&head->mutex);
			mutex_unlock(&head->mutex);
			btrfs_put_delayed_ref(&head->node);
			goto search_again;
		}
		spin_lock(&head->lock);
		if (head->extent_op && head->extent_op->update_flags)
			extent_flags |= head->extent_op->flags_to_set;
		else
			BUG_ON(num_refs == 0);

		num_refs += head->node.ref_mod;
		spin_unlock(&head->lock);
		mutex_unlock(&head->mutex);
	}
	spin_unlock(&delayed_refs->lock);
out:
	WARN_ON(num_refs == 0);
	if (refs)
		*refs = num_refs;
	if (flags)
		*flags = extent_flags;
out_free:
	btrfs_free_path(path);
	return ret;
}

/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
 *    when a reference is dropped we can make sure it was a valid reference
 *    before freeing the extent.
 *
 * 2) Provide enough information to quickly find the holders of an extent
 *    if we notice a given block is corrupted or bad.
 *
 * 3) Make it easy to migrate blocks for FS shrinking or storage pool
 *    maintenance.  This is actually the same as #2, but with a slightly
 *    different use case.
 *
 * There are two kinds of back refs. The implicit back refs is optimized
 * for pointers in non-shared tree blocks. For a given pointer in a block,
 * back refs of this kind provide information about the block's owner tree
 * and the pointer's key. This information allows us to find the block by
 * b-tree searching. The full back refs is for pointers in tree blocks not
 * referenced by their owner trees. The location of the tree block is
 * recorded in the back refs. Actually the full back refs is generic, and
 * can be used in all cases where the implicit back refs is used. The major
 * shortcoming of the full back refs is its overhead. Every time a tree
 * block gets COWed, we have to update the back ref entries for all
 * pointers in it.
 *
 * For a newly allocated tree block, we use implicit back refs for
 * pointers in it. This means most tree related operations only involve
 * implicit back refs. For a tree block created in an old transaction, the
 * only way to drop a reference to it is to COW it. So we can detect the
 * event that a tree block loses its owner tree's reference and do the
 * back refs conversion.
 *
 * When a tree block is COWed through a tree, there are four cases:
 *
 * The reference count of the block is one and the tree is the block's
 * owner tree. Nothing to do in this case.
 *
 * The reference count of the block is one and the tree is not the
 * block's owner tree. In this case, full back refs is used for pointers
 * in the block. Remove these full back refs, add implicit back refs for
 * every pointers in the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * the block's owner tree. In this case, implicit back refs is used for
 * pointers in the block. Add full back refs for every pointers in the
 * block, increase lower level extents' reference counts. The original
 * implicit back refs are entailed to the new block.
 *
 * The reference count of the block is greater than one and the tree is
 * not the block's owner tree. Add implicit back refs for every pointer in
 * the new block, increase lower level extents' reference count.
 *
 * Back Reference Key composing:
 *
 * The key objectid corresponds to the first byte in the extent,
 * The key type is used to differentiate between types of back refs.
 * There are different meanings of the key offset for different types
 * of back refs.
 *
 * File extents can be referenced by:
 *
 * - multiple snapshots, subvolumes, or different generations in one subvol
 * - different files inside a single subvolume
 * - different offsets inside a file (bookend extents in file.c)
 *
 * The extent ref structure for the implicit back refs has fields for:
 *
 * - Objectid of the subvolume root
 * - objectid of the file holding the reference
 * - original offset in the file
 * - how many bookend extents
 *
 * The key offset for the implicit back refs is hash of the first
 * three fields.
 *
 * The extent ref structure for the full back refs has field for:
 *
 * - number of pointers in the tree leaf
 *
 * The key offset for the implicit back refs is the first byte of
 * the tree leaf
 *
 * When a file extent is allocated, The implicit back refs is used.
 * the fields are filled in:
 *
 *     (root_key.objectid, inode objectid, offset in file, 1)
 *
 * When a file extent is removed during file truncation, we find the
 * corresponding implicit back refs and check the following fields:
 *
 *     (btrfs_header_owner(leaf), inode objectid, offset in file)
 *
 * Btree extents can be referenced by:
 *
 * - Different subvolumes
 *
 * Both the implicit back refs and the full back refs for tree blocks
 * only consist of key. The key offset for the implicit back refs is
 * objectid of block's owner tree. The key offset for the full back refs
 * is the first byte of parent block.
 *
 * When implicit back refs is used, information about the lowest key and
 * level of the tree block is required.  This information is stored in
 * the tree block info structure.
 */
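
/*
 * Illustrative sketch (not part of the original comment): an implicit data
 * back ref for a file extent owned by tree 'root', inode 'ino' at file
 * offset 'off' is keyed as
 *
 *	(extent bytenr, BTRFS_EXTENT_DATA_REF_KEY,
 *	 hash_extent_data_ref(root, ino, off))
 *
 * while a full (shared) back ref for a tree block simply uses the byte
 * offset of the parent block:
 *
 *	(extent bytenr, BTRFS_SHARED_BLOCK_REF_KEY, parent bytenr)
 */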

#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_extent_item *item;
	struct btrfs_extent_item_v0 *ei0;
	struct btrfs_extent_ref_v0 *ref0;
	struct btrfs_tree_block_info *bi;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u32 new_size = sizeof(*item);
	u64 refs;
	int ret;

	leaf = path->nodes[0];
	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei0 = btrfs_item_ptr(leaf, path->slots[0],
			     struct btrfs_extent_item_v0);
	refs = btrfs_extent_refs_v0(leaf, ei0);

	if (owner == (u64)-1) {
		while (1) {
			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0); /* Corruption */
				leaf = path->nodes[0];
			}
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0]);
			BUG_ON(key.objectid != found_key.objectid);
			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
				path->slots[0]++;
				continue;
			}
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					      struct btrfs_extent_ref_v0);
			owner = btrfs_ref_objectid_v0(leaf, ref0);
			break;
		}
	}
	btrfs_release_path(path);

	if (owner < BTRFS_FIRST_FREE_OBJECTID)
		new_size += sizeof(*bi);

	new_size -= sizeof(*ei0);
	ret = btrfs_search_slot(trans, root, &key, path,
				new_size + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret); /* Corruption */

	btrfs_extend_item(root, path, new_size);

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, item, refs);
	/* FIXME: get real generation */
	btrfs_set_extent_generation(leaf, item, 0);
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		btrfs_set_extent_flags(leaf, item,
				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
		bi = (struct btrfs_tree_block_info *)(item + 1);
		/* FIXME: get first key of the block */
		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
		btrfs_set_tree_block_level(leaf, bi, (int)owner);
	} else {
		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
	}
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}
#endif

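/*
 * Hash the (root objectid, inode objectid, file offset) triple that
 * identifies an implicit data back ref; the result is used as the key
 * offset of BTRFS_EXTENT_DATA_REF_KEY items.
 */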
static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
{
	u32 high_crc = ~(u32)0;
	u32 low_crc = ~(u32)0;
	__le64 lenum;

	lenum = cpu_to_le64(root_objectid);
	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(owner);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
	lenum = cpu_to_le64(offset);
	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));

	return ((u64)high_crc << 31) ^ (u64)low_crc;
}

static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
				     struct btrfs_extent_data_ref *ref)
{
	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
				    btrfs_extent_data_ref_objectid(leaf, ref),
				    btrfs_extent_data_ref_offset(leaf, ref));
}

static int match_extent_data_ref(struct extent_buffer *leaf,
				 struct btrfs_extent_data_ref *ref,
				 u64 root_objectid, u64 owner, u64 offset)
{
	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		return 0;
	return 1;
}

static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid,
					   u64 owner, u64 offset)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;
	int recow;
	int err = -ENOENT;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
	}
again:
	recow = 0;
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0) {
		err = ret;
		goto fail;
	}

	if (parent) {
		if (!ret)
			return 0;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		btrfs_release_path(path);
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret < 0) {
			err = ret;
			goto fail;
		}
		if (!ret)
			return 0;
#endif
		goto fail;
	}

	leaf = path->nodes[0];
	nritems = btrfs_header_nritems(leaf);
	while (1) {
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				err = ret;
			if (ret)
				goto fail;

			leaf = path->nodes[0];
			nritems = btrfs_header_nritems(leaf);
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != bytenr ||
		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
			goto fail;

		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);

		if (match_extent_data_ref(leaf, ref, root_objectid,
					  owner, offset)) {
			if (recow) {
				btrfs_release_path(path);
				goto again;
			}
			err = 0;
			break;
		}
		path->slots[0]++;
	}
fail:
	return err;
}

static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   u64 bytenr, u64 parent,
					   u64 root_objectid, u64 owner,
					   u64 offset, int refs_to_add)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u32 size;
	u32 num_refs;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_DATA_REF_KEY;
		key.offset = parent;
		size = sizeof(struct btrfs_shared_data_ref);
	} else {
		key.type = BTRFS_EXTENT_DATA_REF_KEY;
		key.offset = hash_extent_data_ref(root_objectid,
						  owner, offset);
		size = sizeof(struct btrfs_extent_data_ref);
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
	if (ret && ret != -EEXIST)
		goto fail;

	leaf = path->nodes[0];
	if (parent) {
		struct btrfs_shared_data_ref *ref;
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_shared_data_ref);
		if (ret == 0) {
			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_shared_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
		}
	} else {
		struct btrfs_extent_data_ref *ref;
		while (ret == -EEXIST) {
			ref = btrfs_item_ptr(leaf, path->slots[0],
					     struct btrfs_extent_data_ref);
			if (match_extent_data_ref(leaf, ref, root_objectid,
						  owner, offset))
				break;
			btrfs_release_path(path);
			key.offset++;
			ret = btrfs_insert_empty_item(trans, root, path, &key,
						      size);
			if (ret && ret != -EEXIST)
				goto fail;

			leaf = path->nodes[0];
		}
		ref = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_data_ref);
		if (ret == 0) {
			btrfs_set_extent_data_ref_root(leaf, ref,
						       root_objectid);
			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
		} else {
			num_refs = btrfs_extent_data_ref_count(leaf, ref);
			num_refs += refs_to_add;
			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
		}
	}
	btrfs_mark_buffer_dirty(leaf);
	ret = 0;
fail:
	btrfs_release_path(path);
	return ret;
}

static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
					   struct btrfs_root *root,
					   struct btrfs_path *path,
					   int refs_to_drop, int *last_ref)
{
	struct btrfs_key key;
	struct btrfs_extent_data_ref *ref1 = NULL;
	struct btrfs_shared_data_ref *ref2 = NULL;
	struct extent_buffer *leaf;
	u32 num_refs = 0;
	int ret = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		BUG();
	}

	BUG_ON(num_refs < refs_to_drop);
	num_refs -= refs_to_drop;

	if (num_refs == 0) {
		ret = btrfs_del_item(trans, root, path);
		*last_ref = 1;
	} else {
		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		else {
			struct btrfs_extent_ref_v0 *ref0;
			ref0 = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_extent_ref_v0);
			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
		}
#endif
Z
	}
	return ret;
}

static noinline u32 extent_data_ref_count(struct btrfs_path *path,
					  struct btrfs_extent_inline_ref *iref)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref1;
	struct btrfs_shared_data_ref *ref2;
	u32 num_refs = 0;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (iref) {
		if (btrfs_extent_inline_ref_type(leaf, iref) ==
		    BTRFS_EXTENT_DATA_REF_KEY) {
			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
		} else {
			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
		}
	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
		ref1 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_data_ref);
		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
		ref2 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_shared_data_ref);
		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
		struct btrfs_extent_ref_v0 *ref0;
		ref0 = btrfs_item_ptr(leaf, path->slots[0],
				      struct btrfs_extent_ref_v0);
		num_refs = btrfs_ref_count_v0(leaf, ref0);
#endif
	} else {
		WARN_ON(1);
	}
	return num_refs;
}

static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -ENOENT;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (ret == -ENOENT && parent) {
		btrfs_release_path(path);
		key.type = BTRFS_EXTENT_REF_V0_KEY;
		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
		if (ret > 0)
			ret = -ENOENT;
	}
#endif
	return ret;
}

static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 bytenr, u64 parent,
					  u64 root_objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = bytenr;
	if (parent) {
		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
		key.offset = parent;
	} else {
		key.type = BTRFS_TREE_BLOCK_REF_KEY;
		key.offset = root_objectid;
	}

	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1487
	btrfs_release_path(path);
Z
}

static inline int extent_ref_type(u64 parent, u64 owner)
{
	int type;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (parent > 0)
			type = BTRFS_SHARED_BLOCK_REF_KEY;
		else
			type = BTRFS_TREE_BLOCK_REF_KEY;
	} else {
		if (parent > 0)
			type = BTRFS_SHARED_DATA_REF_KEY;
		else
			type = BTRFS_EXTENT_DATA_REF_KEY;
	}
	return type;
}

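/*
 * Return in *key the key that follows the current path position, searching
 * upwards through the tree levels; returns 0 on success or 1 if the path
 * is already at the last key.
 */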
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key)

{
	for (; level < BTRFS_MAX_LEVEL; level++) {
		if (!path->nodes[level])
			break;
		if (path->slots[level] + 1 >=
		    btrfs_header_nritems(path->nodes[level]))
			continue;
		if (level == 0)
			btrfs_item_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		else
			btrfs_node_key_to_cpu(path->nodes[level], key,
					      path->slots[level] + 1);
		return 0;
	}
	return 1;
}

/*
 * look for inline back ref. if back ref is found, *ref_ret is set
 * to the address of inline back ref, and 0 is returned.
 *
 * if back ref isn't found, *ref_ret is set to the address where it
 * should be inserted, and -ENOENT is returned.
 *
 * if insert is true and there are too many inline back refs, the path
 * points to the extent item, and -EAGAIN is returned.
 *
 * NOTE: inline back refs are ordered in the same way that back ref
 *	 items in the tree are ordered.
 */
static noinline_for_stack
int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
	u64 flags;
	u64 item_size;
	unsigned long ptr;
	unsigned long end;
	int extra_size;
	int type;
	int want;
	int ret;
	int err = 0;
	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
						 SKINNY_METADATA);

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = num_bytes;

	want = extent_ref_type(parent, owner);
	if (insert) {
		extra_size = btrfs_extent_inline_ref_size(want);
		path->keep_locks = 1;
	} else
		extra_size = -1;

	/*
	 * Owner is our parent level, so we can just add one to get the level
	 * for the block we are interested in.
	 */
	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = owner;
	}

again:
	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}

	/*
	 * We may be a newly converted file system which still has the old fat
	 * extent entries for metadata, so try and see if we have one of those.
	 */
	if (ret > 0 && skinny_metadata) {
		skinny_metadata = false;
		if (path->slots[0]) {
			path->slots[0]--;
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes)
				ret = 0;
		}
		if (ret) {
			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;
			btrfs_release_path(path);
			goto again;
		}
	}

	if (ret && !insert) {
		err = -ENOENT;
		goto out;
	} else if (WARN_ON(ret)) {
		err = -EIO;
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (item_size < sizeof(*ei)) {
		if (!insert) {
			err = -ENOENT;
			goto out;
		}
		ret = convert_extent_item_v0(trans, root, path, owner,
					     extra_size);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	}
#endif
	BUG_ON(item_size < sizeof(*ei));

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	flags = btrfs_extent_flags(leaf, ei);

	ptr = (unsigned long)(ei + 1);
	end = (unsigned long)ei + item_size;

	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
		ptr += sizeof(struct btrfs_tree_block_info);
		BUG_ON(ptr > end);
	}

	err = -ENOENT;
	while (1) {
		if (ptr >= end) {
			WARN_ON(ptr > end);
			break;
		}
		iref = (struct btrfs_extent_inline_ref *)ptr;
		type = btrfs_extent_inline_ref_type(leaf, iref);
		if (want < type)
			break;
		if (want > type) {
			ptr += btrfs_extent_inline_ref_size(type);
			continue;
		}

		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
			struct btrfs_extent_data_ref *dref;
			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
			if (match_extent_data_ref(leaf, dref, root_objectid,
						  owner, offset)) {
				err = 0;
				break;
			}
			if (hash_extent_data_ref_item(leaf, dref) <
			    hash_extent_data_ref(root_objectid, owner, offset))
				break;
		} else {
			u64 ref_offset;
			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
			if (parent > 0) {
				if (parent == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < parent)
					break;
			} else {
				if (root_objectid == ref_offset) {
					err = 0;
					break;
				}
				if (ref_offset < root_objectid)
					break;
			}
		}
		ptr += btrfs_extent_inline_ref_size(type);
	}
	if (err == -ENOENT && insert) {
		if (item_size + extra_size >=
		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
			err = -EAGAIN;
			goto out;
		}
		/*
		 * To add a new inline back ref, we have to make sure
		 * there is no corresponding back ref item.
		 * For simplicity, we just do not add a new inline back
		 * ref if there is any kind of item for this block.
		 */
		if (find_next_key(path, 0, &key) == 0 &&
		    key.objectid == bytenr &&
		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
			err = -EAGAIN;
			goto out;
		}
	}
	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
	if (insert) {
		path->keep_locks = 0;
		btrfs_unlock_up_safe(path, 1);
	}
	return err;
}

/*
 * helper to add new inline back ref
 */
static noinline_for_stack
void setup_inline_extent_backref(struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	unsigned long ptr;
	unsigned long end;
	unsigned long item_offset;
	u64 refs;
	int size;
	int type;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	item_offset = (unsigned long)iref - (unsigned long)ei;

	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

	btrfs_extend_item(root, path, size);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	refs += refs_to_add;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	ptr = (unsigned long)ei + item_offset;
	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
	if (ptr < end - size)
		memmove_extent_buffer(leaf, ptr + size, ptr,
				      end - size - ptr);

	iref = (struct btrfs_extent_inline_ref *)ptr;
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		struct btrfs_extent_data_ref *dref;
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		struct btrfs_shared_data_ref *sref;
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}
	btrfs_mark_buffer_dirty(leaf);
}

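/*
 * Look up a backref for the given extent.  We first search for an inline
 * backref in the extent item itself; if none is found, fall back to the
 * separate keyed backref item (tree block ref or extent data ref).
 */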
static int lookup_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref **ref_ret,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;

	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 0);
	if (ret != -ENOENT)
		return ret;

	btrfs_release_path(path);
	*ref_ret = NULL;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
					    root_objectid);
	} else {
		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
					     root_objectid, owner, offset);
	}
	return ret;
}

/*
 * helper to update/remove inline back ref
 */
static noinline_for_stack
void update_inline_extent_backref(struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_extent_inline_ref *iref,
				  int refs_to_mod,
				  struct btrfs_delayed_extent_op *extent_op,
				  int *last_ref)
{
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
	struct btrfs_extent_data_ref *dref = NULL;
	struct btrfs_shared_data_ref *sref = NULL;
	unsigned long ptr;
	unsigned long end;
	u32 item_size;
	int size;
	int type;
	u64 refs;

	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
	refs += refs_to_mod;
	btrfs_set_extent_refs(leaf, ei, refs);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, ei);

	type = btrfs_extent_inline_ref_type(leaf, iref);

	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
		refs = btrfs_extent_data_ref_count(leaf, dref);
	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
		sref = (struct btrfs_shared_data_ref *)(iref + 1);
		refs = btrfs_shared_data_ref_count(leaf, sref);
	} else {
		refs = 1;
		BUG_ON(refs_to_mod != -1);
	}

	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
	refs += refs_to_mod;

	if (refs > 0) {
		if (type == BTRFS_EXTENT_DATA_REF_KEY)
			btrfs_set_extent_data_ref_count(leaf, dref, refs);
		else
			btrfs_set_shared_data_ref_count(leaf, sref, refs);
	} else {
		*last_ref = 1;
		size =  btrfs_extent_inline_ref_size(type);
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		ptr = (unsigned long)iref;
		end = (unsigned long)ei + item_size;
		if (ptr + size < end)
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
		btrfs_truncate_item(root, path, item_size, 1);
	}
	btrfs_mark_buffer_dirty(leaf);
}

static noinline_for_stack
int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 bytenr, u64 num_bytes, u64 parent,
				 u64 root_objectid, u64 owner,
				 u64 offset, int refs_to_add,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_extent_inline_ref *iref;
	int ret;

	ret = lookup_inline_extent_backref(trans, root, path, &iref,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset, 1);
	if (ret == 0) {
		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
		update_inline_extent_backref(root, path, iref,
					     refs_to_add, extent_op, NULL);
	} else if (ret == -ENOENT) {
		setup_inline_extent_backref(root, path, iref, parent,
					    root_objectid, owner, offset,
					    refs_to_add, extent_op);
		ret = 0;
	}
	return ret;
}

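/*
 * Insert a keyed (non-inline) backref item: a tree block ref for metadata
 * or an extent data ref for file data.
 */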
static int insert_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 u64 bytenr, u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int refs_to_add)
{
	int ret;
	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		BUG_ON(refs_to_add != 1);
		ret = insert_tree_block_ref(trans, root, path, bytenr,
					    parent, root_objectid);
	} else {
		ret = insert_extent_data_ref(trans, root, path, bytenr,
					     parent, root_objectid,
					     owner, offset, refs_to_add);
	}
	return ret;
}

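/*
 * Drop refs_to_drop references from a backref, either by updating the
 * inline backref in place or by removing/updating the keyed backref item.
 * *last_ref is set to 1 when the last reference to the extent went away.
 */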
static int remove_extent_backref(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_extent_inline_ref *iref,
				 int refs_to_drop, int is_data, int *last_ref)
{
	int ret = 0;

	BUG_ON(!is_data && refs_to_drop != 1);
	if (iref) {
		update_inline_extent_backref(root, path, iref,
					     -refs_to_drop, NULL, last_ref);
	} else if (is_data) {
		ret = remove_extent_data_ref(trans, root, path, refs_to_drop,
					     last_ref);
	} else {
		*last_ref = 1;
		ret = btrfs_del_item(trans, root, path);
	}
	return ret;
}

#define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
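/*
 * Issue discards for [start, start + len) on one device, carefully skipping
 * any superblock mirrors that fall inside the range, and report how many
 * bytes were actually discarded.
 */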
static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
			       u64 *discarded_bytes)
{
	int j, ret = 0;
	u64 bytes_left, end;
	u64 aligned_start = ALIGN(start, 1 << 9);

	if (WARN_ON(start != aligned_start)) {
		len -= aligned_start - start;
		len = round_down(len, 1 << 9);
		start = aligned_start;
	}

	*discarded_bytes = 0;

	if (!len)
		return 0;

	end = start + len;
	bytes_left = len;

	/* Skip any superblocks on this device. */
	for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
		u64 sb_start = btrfs_sb_offset(j);
		u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
		u64 size = sb_start - start;

		if (!in_range(sb_start, start, bytes_left) &&
		    !in_range(sb_end, start, bytes_left) &&
		    !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
			continue;

		/*
		 * Superblock spans beginning of range.  Adjust start and
		 * try again.
		 */
		if (sb_start <= start) {
			start += sb_end - start;
			if (start > end) {
				bytes_left = 0;
				break;
			}
			bytes_left = end - start;
			continue;
		}

		if (size) {
			ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
						   GFP_NOFS, 0);
			if (!ret)
				*discarded_bytes += size;
			else if (ret != -EOPNOTSUPP)
				return ret;
		}

		start = sb_end;
		if (start > end) {
			bytes_left = 0;
			break;
		}
		bytes_left = end - start;
	}

	if (bytes_left) {
		ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
					   GFP_NOFS, 0);
		if (!ret)
			*discarded_bytes += bytes_left;
	}
	return ret;
}

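/*
 * Map the logical range to the underlying stripes and discard it on every
 * device that supports discard.  The number of bytes actually discarded is
 * returned via *actual_bytes when the caller asks for it.
 */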
int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
			 u64 num_bytes, u64 *actual_bytes)
{
	int ret;
	u64 discarded_bytes = 0;
	struct btrfs_bio *bbio = NULL;

	/*
	 * Avoid races with device replace and make sure our bbio has devices
	 * associated to its stripes that don't go away while we are discarding.
	 */
	btrfs_bio_counter_inc_blocked(root->fs_info);
	/* Tell the block device(s) that the sectors can be discarded */
	ret = btrfs_map_block(root->fs_info, REQ_OP_DISCARD,
			      bytenr, &num_bytes, &bbio, 0);
	/* Error condition is -ENOMEM */
	if (!ret) {
		struct btrfs_bio_stripe *stripe = bbio->stripes;
		int i;


		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
			u64 bytes;
			if (!stripe->dev->can_discard)
				continue;

			ret = btrfs_issue_discard(stripe->dev->bdev,
						  stripe->physical,
						  stripe->length,
						  &bytes);
			if (!ret)
				discarded_bytes += bytes;
			else if (ret != -EOPNOTSUPP)
				break; /* Logic errors or -ENOMEM, or -EIO,
					* but I don't know how that could
					* happen - JDM */

			/*
			 * Just in case we get back EOPNOTSUPP for some reason,
			 * just ignore the return value so we don't screw up
			 * people calling discard_extent.
			 */
			ret = 0;
		}
		btrfs_put_bbio(bbio);
	}
	btrfs_bio_counter_dec(root->fs_info);

	if (actual_bytes)
		*actual_bytes = discarded_bytes;

	if (ret == -EOPNOTSUPP)
		ret = 0;
	return ret;
}

/* Can return -ENOMEM */
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 u64 bytenr, u64 num_bytes, u64 parent,
			 u64 root_objectid, u64 owner, u64 offset)
{
	int ret;
	struct btrfs_fs_info *fs_info = root->fs_info;

	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
	       root_objectid == BTRFS_TREE_LOG_OBJECTID);

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
					num_bytes,
					parent, root_objectid, (int)owner,
					BTRFS_ADD_DELAYED_REF, NULL);
	} else {
		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
					num_bytes, parent, root_objectid,
					owner, offset, 0,
					BTRFS_ADD_DELAYED_REF, NULL);
	}
	return ret;
}

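/*
 * Add refs_to_add references to an existing extent.  We try to add an inline
 * backref first; if the extent item has no room (-EAGAIN), bump the ref count
 * and insert a separate keyed backref instead.
 */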
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_delayed_ref_node *node,
				  u64 parent, u64 root_objectid,
				  u64 owner, u64 offset, int refs_to_add,
				  struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *item;
	struct btrfs_key key;
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
	u64 refs;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	path->leave_spinning = 1;
	/* this will setup the path even if it fails to insert the back ref */
	ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
					   bytenr, num_bytes, parent,
					   root_objectid, owner, offset,
					   refs_to_add, extent_op);
	if ((ret < 0 && ret != -EAGAIN) || !ret)
		goto out;

	/*
	 * Ok we had -EAGAIN which means we didn't have space to insert an
	 * inline extent ref, so just update the reference count and add a
	 * normal backref.
	 */
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, item);
	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
	if (extent_op)
		__run_delayed_extent_op(extent_op, leaf, item);

	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	path->reada = READA_FORWARD;
	path->leave_spinning = 1;
	/* now insert the actual backref */
	ret = insert_extent_backref(trans, root->fs_info->extent_root,
				    path, bytenr, parent, root_objectid,
				    owner, offset, refs_to_add);
	if (ret)
		btrfs_abort_transaction(trans, ret);
out:
	btrfs_free_path(path);
	return ret;
}

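/*
 * Apply a single delayed data ref: allocate the reserved extent, add a
 * reference, or drop one, depending on the ref action.
 */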
static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_data_ref *ref;
	struct btrfs_key ins;
	u64 parent = 0;
	u64 ref_root = 0;
	u64 flags = 0;

	ins.objectid = node->bytenr;
	ins.offset = node->num_bytes;
	ins.type = BTRFS_EXTENT_ITEM_KEY;

	ref = btrfs_delayed_node_to_data_ref(node);
	trace_run_delayed_data_ref(root->fs_info, node, ref, node->action);

	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
		parent = ref->parent;
	ref_root = ref->root;

	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		if (extent_op)
			flags |= extent_op->flags_to_set;
		ret = alloc_reserved_file_extent(trans, root,
						 parent, ref_root, flags,
						 ref->objectid, ref->offset,
						 &ins, node->ref_mod);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, root, node, parent,
					     ref_root, ref->objectid,
					     ref->offset, node->ref_mod,
					     extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, root, node, parent,
					  ref_root, ref->objectid,
					  ref->offset, node->ref_mod,
					  extent_op);
	} else {
		BUG();
	}
	return ret;
}

static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei)
{
	u64 flags = btrfs_extent_flags(leaf, ei);
	if (extent_op->update_flags) {
		flags |= extent_op->flags_to_set;
		btrfs_set_extent_flags(leaf, ei, flags);
	}

	if (extent_op->update_key) {
		struct btrfs_tree_block_info *bi;
		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
	}
}

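/*
 * Apply a delayed extent op (flags and/or key update) directly to the
 * extent item in the extent tree.
 */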
static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_delayed_ref_node *node,
				 struct btrfs_delayed_extent_op *extent_op)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	struct btrfs_extent_item *ei;
	struct extent_buffer *leaf;
	u32 item_size;
	int ret;
	int err = 0;
	int metadata = !extent_op->is_data;

	if (trans->aborted)
		return 0;

	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
		metadata = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = node->bytenr;

	if (metadata) {
		key.type = BTRFS_METADATA_ITEM_KEY;
		key.offset = extent_op->level;
	} else {
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = node->num_bytes;
	}

again:
	path->reada = READA_FORWARD;
	path->leave_spinning = 1;
	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
				path, 0, 1);
	if (ret < 0) {
		err = ret;
		goto out;
	}
	if (ret > 0) {
		if (metadata) {
			if (path->slots[0] > 0) {
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == node->bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == node->num_bytes)
					ret = 0;
			}
			if (ret > 0) {
				btrfs_release_path(path);
				metadata = 0;

				key.objectid = node->bytenr;
				key.offset = node->num_bytes;
				key.type = BTRFS_EXTENT_ITEM_KEY;
				goto again;
			}
		} else {
			err = -EIO;
			goto out;
		}
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (item_size < sizeof(*ei)) {
		ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
					     path, (u64)-1, 0);
		if (ret < 0) {
			err = ret;
			goto out;
		}
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	}
#endif
	BUG_ON(item_size < sizeof(*ei));
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	__run_delayed_extent_op(extent_op, leaf, ei);

	btrfs_mark_buffer_dirty(leaf);
out:
	btrfs_free_path(path);
	return err;
}

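/*
 * Apply a single delayed tree block ref: allocate the reserved tree block,
 * add a reference, or drop one, depending on the ref action.
 */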
static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				struct btrfs_delayed_ref_node *node,
				struct btrfs_delayed_extent_op *extent_op,
				int insert_reserved)
{
	int ret = 0;
	struct btrfs_delayed_tree_ref *ref;
	struct btrfs_key ins;
	u64 parent = 0;
	u64 ref_root = 0;
	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
						 SKINNY_METADATA);

	ref = btrfs_delayed_node_to_tree_ref(node);
	trace_run_delayed_tree_ref(root->fs_info, node, ref, node->action);

	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		parent = ref->parent;
	ref_root = ref->root;

	ins.objectid = node->bytenr;
	if (skinny_metadata) {
		ins.offset = ref->level;
		ins.type = BTRFS_METADATA_ITEM_KEY;
	} else {
		ins.offset = node->num_bytes;
		ins.type = BTRFS_EXTENT_ITEM_KEY;
	}

	BUG_ON(node->ref_mod != 1);
	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
		BUG_ON(!extent_op || !extent_op->update_flags);
		ret = alloc_reserved_tree_block(trans, root,
						parent, ref_root,
						extent_op->flags_to_set,
						&extent_op->key,
						ref->level, &ins);
	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
		ret = __btrfs_inc_extent_ref(trans, root, node,
					     parent, ref_root,
					     ref->level, 0, 1,
					     extent_op);
	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
		ret = __btrfs_free_extent(trans, root, node,
					  parent, ref_root,
					  ref->level, 0, 1, extent_op);
	} else {
		BUG();
	}
	return ret;
}

/* helper function to actually process a single delayed ref entry */
static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_delayed_ref_node *node,
			       struct btrfs_delayed_extent_op *extent_op,
			       int insert_reserved)
{
	int ret = 0;

	if (trans->aborted) {
		if (insert_reserved)
			btrfs_pin_extent(root, node->bytenr,
					 node->num_bytes, 1);
		return 0;
	}

	if (btrfs_delayed_ref_is_head(node)) {
		struct btrfs_delayed_ref_head *head;
		/*
		 * we've hit the end of the chain and we were supposed
		 * to insert this extent into the tree.  But, it got
		 * deleted before we ever needed to insert it, so all
		 * we have to do is clean up the accounting
		 */
		BUG_ON(extent_op);
		head = btrfs_delayed_node_to_head(node);
		trace_run_delayed_ref_head(root->fs_info, node, head,
					   node->action);

		if (insert_reserved) {
			btrfs_pin_extent(root, node->bytenr,
					 node->num_bytes, 1);
			if (head->is_data) {
				ret = btrfs_del_csums(trans, root,
						      node->bytenr,
						      node->num_bytes);
			}
		}

		/* Also free its reserved qgroup space */
		btrfs_qgroup_free_delayed_ref(root->fs_info,
					      head->qgroup_ref_root,
					      head->qgroup_reserved);
		return ret;
	}

	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
		ret = run_delayed_tree_ref(trans, root, node, extent_op,
					   insert_reserved);
	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
		 node->type == BTRFS_SHARED_DATA_REF_KEY)
		ret = run_delayed_data_ref(trans, root, node, extent_op,
					   insert_reserved);
	else
		BUG();
	return ret;
}

static inline struct btrfs_delayed_ref_node *
select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
	struct btrfs_delayed_ref_node *ref;

	if (list_empty(&head->ref_list))
		return NULL;

	/*
	 * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
	 * This is to prevent a ref count from going down to zero, which deletes
	 * the extent item from the extent tree, when there still are references
	 * to add, which would fail because they would not find the extent item.
	 */
	list_for_each_entry(ref, &head->ref_list, list) {
		if (ref->action == BTRFS_ADD_DELAYED_REF)
			return ref;
	}

	return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
			  list);
}

/*
 * Returns 0 on success or if called with an already aborted transaction.
 * Returns -ENOMEM or -EIO on failure and will abort the transaction.
 */
static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
					     struct btrfs_root *root,
					     unsigned long nr)
{
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_ref_head *locked_ref = NULL;
	struct btrfs_delayed_extent_op *extent_op;
	struct btrfs_fs_info *fs_info = root->fs_info;
	ktime_t start = ktime_get();
	int ret;
	unsigned long count = 0;
	unsigned long actual_count = 0;
	int must_insert_reserved = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	while (1) {
		if (!locked_ref) {
			if (count >= nr)
				break;

			spin_lock(&delayed_refs->lock);
			locked_ref = btrfs_select_ref_head(trans);
			if (!locked_ref) {
				spin_unlock(&delayed_refs->lock);
				break;
			}

			/* grab the lock that says we are going to process
			 * all the refs for this head */
			ret = btrfs_delayed_ref_lock(trans, locked_ref);
			spin_unlock(&delayed_refs->lock);
			/*
			 * we may have dropped the spin lock to get the head
			 * mutex lock, and that might have given someone else
			 * time to free the head.  If that's true, it has been
			 * removed from our list and we can move on.
			 */
			if (ret == -EAGAIN) {
				locked_ref = NULL;
				count++;
				continue;
			}
		}

		/*
		 * We need to try and merge add/drops of the same ref since we
		 * can run into issues with relocate dropping the implicit ref
		 * and then it being added back again before the drop can
		 * finish.  If we merged anything we need to re-loop so we can
		 * get a good ref.
		 * Or we can get node references of the same type that weren't
		 * merged when created due to bumps in the tree mod seq, and
		 * we need to merge them to prevent adding an inline extent
		 * backref before dropping it (triggering a BUG_ON at
		 * insert_inline_extent_backref()).
		 */
		spin_lock(&locked_ref->lock);
		btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
					 locked_ref);

		/*
		 * locked_ref is the head node, so we have to go one
		 * node back for any delayed ref updates
		 */
		ref = select_delayed_ref(locked_ref);

		if (ref && ref->seq &&
		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
			spin_unlock(&locked_ref->lock);
			btrfs_delayed_ref_unlock(locked_ref);
			spin_lock(&delayed_refs->lock);
			locked_ref->processing = 0;
			delayed_refs->num_heads_ready++;
			spin_unlock(&delayed_refs->lock);
			locked_ref = NULL;
			cond_resched();
			count++;
			continue;
		}

		/*
		 * record the must insert reserved flag before we
		 * drop the spin lock.
		 */
		must_insert_reserved = locked_ref->must_insert_reserved;
		locked_ref->must_insert_reserved = 0;

		extent_op = locked_ref->extent_op;
		locked_ref->extent_op = NULL;

		if (!ref) {


			/*
			 * All delayed refs have been processed, go ahead
			 * and send the head node to run_one_delayed_ref,
			 * so that any accounting fixes can happen.
			 */
			ref = &locked_ref->node;

			if (extent_op && must_insert_reserved) {
				btrfs_free_delayed_extent_op(extent_op);
				extent_op = NULL;
			}

			if (extent_op) {
				spin_unlock(&locked_ref->lock);
				ret = run_delayed_extent_op(trans, root,
							    ref, extent_op);
				btrfs_free_delayed_extent_op(extent_op);

				if (ret) {
					/*
					 * Need to reset must_insert_reserved if
					 * there was an error so the abort stuff
					 * can cleanup the reserved space
					 * properly.
					 */
					if (must_insert_reserved)
						locked_ref->must_insert_reserved = 1;
					locked_ref->processing = 0;
					btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
					btrfs_delayed_ref_unlock(locked_ref);
					return ret;
				}
				continue;
			}

			/*
			 * Need to drop our head ref lock and re-acquire the
			 * delayed ref lock and then re-check to make sure
			 * nobody got added.
			 */
			spin_unlock(&locked_ref->lock);
			spin_lock(&delayed_refs->lock);
			spin_lock(&locked_ref->lock);
			if (!list_empty(&locked_ref->ref_list) ||
			    locked_ref->extent_op) {
				spin_unlock(&locked_ref->lock);
				spin_unlock(&delayed_refs->lock);
				continue;
			}
			ref->in_tree = 0;
			delayed_refs->num_heads--;
			rb_erase(&locked_ref->href_node,
				 &delayed_refs->href_root);
			spin_unlock(&delayed_refs->lock);
		} else {
			actual_count++;
			ref->in_tree = 0;
			list_del(&ref->list);
		}
		atomic_dec(&delayed_refs->num_entries);

		if (!btrfs_delayed_ref_is_head(ref)) {
			/*
			 * when we play the delayed ref, also correct the
			 * ref_mod on head
			 */
			switch (ref->action) {
			case BTRFS_ADD_DELAYED_REF:
			case BTRFS_ADD_DELAYED_EXTENT:
				locked_ref->node.ref_mod -= ref->ref_mod;
				break;
			case BTRFS_DROP_DELAYED_REF:
				locked_ref->node.ref_mod += ref->ref_mod;
				break;
			default:
				WARN_ON(1);
			}
		}
		spin_unlock(&locked_ref->lock);

		ret = run_one_delayed_ref(trans, root, ref, extent_op,
					  must_insert_reserved);

		btrfs_free_delayed_extent_op(extent_op);
		if (ret) {
			locked_ref->processing = 0;
			btrfs_delayed_ref_unlock(locked_ref);
			btrfs_put_delayed_ref(ref);
			btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
			return ret;
		}

		/*
		 * If this node is a head, that means all the refs in this head
		 * have been dealt with, and we will pick the next head to deal
		 * with, so we must unlock the head and drop it from the cluster
		 * list before we release it.
		 */
		if (btrfs_delayed_ref_is_head(ref)) {
			if (locked_ref->is_data &&
			    locked_ref->total_ref_mod < 0) {
				spin_lock(&delayed_refs->lock);
				delayed_refs->pending_csums -= ref->num_bytes;
				spin_unlock(&delayed_refs->lock);
			}
			btrfs_delayed_ref_unlock(locked_ref);
			locked_ref = NULL;
		}
		btrfs_put_delayed_ref(ref);
		count++;
		cond_resched();
	}

	/*
	 * We don't want to include ref heads since we can have empty ref heads
	 * and those will drastically skew our runtime down since we just do
	 * accounting, no actual extent tree updates.
	 */
	if (actual_count > 0) {
		u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
		u64 avg;

		/*
		 * We weigh the current average higher than our current runtime
		 * to avoid large swings in the average.
		 */
		spin_lock(&delayed_refs->lock);
		avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
		fs_info->avg_delayed_ref_runtime = avg >> 2;	/* div by 4 */
		spin_unlock(&delayed_refs->lock);
	}
	return 0;
}

#ifdef SCRAMBLE_DELAYED_REFS
/*
 * Normally delayed refs get processed in ascending bytenr order. This
 * correlates in most cases to the order added. To expose dependencies on this
 * order, we start to process the tree in the middle instead of the beginning
 */
static u64 find_middle(struct rb_root *root)
{
	struct rb_node *n = root->rb_node;
	struct btrfs_delayed_ref_node *entry;
	int alt = 1;
	u64 middle;
	u64 first = 0, last = 0;

	n = rb_first(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		first = entry->bytenr;
	}
	n = rb_last(root);
	if (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		last = entry->bytenr;
	}
	n = root->rb_node;

	while (n) {
		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
		WARN_ON(!entry->in_tree);

		middle = entry->bytenr;

		if (alt)
			n = n->rb_left;
		else
			n = n->rb_right;

		alt = 1 - alt;
	}
	return middle;
}
#endif

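/*
 * Estimate how many extent tree leaves a given number of delayed ref heads
 * could dirty, so we can reserve enough metadata space for them.
 */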
static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
{
	u64 num_bytes;

	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
			     sizeof(struct btrfs_extent_inline_ref));
	if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
		num_bytes += heads * sizeof(struct btrfs_tree_block_info);

	/*
	 * We don't ever fill up leaves all the way so multiply by 2 just to be
	 * closer to what we're really going to want to use.
	 */
	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
}

/*
 * Takes the number of bytes to be csumm'ed and figures out how many leaves it
 * would require to store the csums for that many bytes.
 */
u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
{
	u64 csum_size;
	u64 num_csums_per_leaf;
	u64 num_csums;

	csum_size = BTRFS_MAX_ITEM_SIZE(root);
	num_csums_per_leaf = div64_u64(csum_size,
			(u64)btrfs_super_csum_size(root->fs_info->super_copy));
	num_csums = div64_u64(csum_bytes, root->sectorsize);
	num_csums += num_csums_per_leaf - 1;
	num_csums = div64_u64(num_csums, num_csums_per_leaf);
	return num_csums;
}

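/*
 * Return 1 if the global block reserve looks too small to safely run the
 * currently queued delayed refs and dirty block groups, 0 otherwise.
 */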
int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root)
{
	struct btrfs_block_rsv *global_rsv;
	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
	u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
	u64 num_dirty_bgs = trans->transaction->num_dirty_bgs;
	u64 num_bytes, num_dirty_bgs_bytes;
	int ret = 0;

	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
	num_heads = heads_to_leaves(root, num_heads);
	if (num_heads > 1)
		num_bytes += (num_heads - 1) * root->nodesize;
	num_bytes <<= 1;
	num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
	num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
							     num_dirty_bgs);
	global_rsv = &root->fs_info->global_block_rsv;

	/*
	 * If we can't allocate any more chunks lets make sure we have _lots_ of
	 * wiggle room since running delayed refs can create more delayed refs.
	 */
	if (global_rsv->space_info->full) {
		num_dirty_bgs_bytes <<= 1;
		num_bytes <<= 1;
	}

	spin_lock(&global_rsv->lock);
	if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
		ret = 1;
	spin_unlock(&global_rsv->lock);
	return ret;
}

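/*
 * Decide whether the caller should throttle and run delayed refs itself:
 * returns 1 or 2 when the backlog of delayed ref entries is large, otherwise
 * falls back to the space check above.
 */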
int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 num_entries =
		atomic_read(&trans->transaction->delayed_refs.num_entries);
	u64 avg_runtime;
	u64 val;

	smp_mb();
	avg_runtime = fs_info->avg_delayed_ref_runtime;
	val = num_entries * avg_runtime;
	if (num_entries * avg_runtime >= NSEC_PER_SEC)
		return 1;
	if (val >= NSEC_PER_SEC / 2)
		return 2;

	return btrfs_check_space_for_delayed_refs(trans, root);
}

struct async_delayed_refs {
	struct btrfs_root *root;
	u64 transid;
	int count;
	int error;
	int sync;
	struct completion wait;
	struct btrfs_work work;
};

static void delayed_ref_async_start(struct btrfs_work *work)
{
	struct async_delayed_refs *async;
	struct btrfs_trans_handle *trans;
	int ret;

	async = container_of(work, struct async_delayed_refs, work);

	/* if the commit is already started, we don't need to wait here */
	if (btrfs_transaction_blocked(async->root->fs_info))
		goto done;

	trans = btrfs_join_transaction(async->root);
	if (IS_ERR(trans)) {
		async->error = PTR_ERR(trans);
		goto done;
	}

	/*
	 * trans->sync means that when we call end_transaction, we won't
	 * wait on delayed refs
	 */
	trans->sync = true;

	/* Don't bother flushing if we got into a different transaction */
	if (trans->transid > async->transid)
		goto end;

	ret = btrfs_run_delayed_refs(trans, async->root, async->count);
	if (ret)
		async->error = ret;
end:
	ret = btrfs_end_transaction(trans, async->root);
	if (ret && !async->error)
		async->error = ret;
done:
	if (async->sync)
		complete(&async->wait);
	else
		kfree(async);
}

int btrfs_async_run_delayed_refs(struct btrfs_root *root,
				 unsigned long count, u64 transid, int wait)
{
	struct async_delayed_refs *async;
	int ret;

	async = kmalloc(sizeof(*async), GFP_NOFS);
	if (!async)
		return -ENOMEM;

	async->root = root->fs_info->tree_root;
	async->count = count;
	async->error = 0;
	async->transid = transid;
	if (wait)
		async->sync = 1;
	else
		async->sync = 0;
	init_completion(&async->wait);

	btrfs_init_work(&async->work, btrfs_extent_refs_helper,
			delayed_ref_async_start, NULL, NULL);

	btrfs_queue_work(root->fs_info->extent_workers, &async->work);

	if (wait) {
		wait_for_completion(&async->wait);
		ret = async->error;
		kfree(async);
		return ret;
	}
	return 0;
}

/*
 * this starts processing the delayed reference count updates and
 * extent insertions we have queued up so far.  count can be
 * 0, which means to process everything in the tree at the start
 * of the run (but not newly added entries), or it can be some target
 * number you'd like to process.
 *
 * Returns 0 on success or if called with an aborted transaction
 * Returns <0 on error and aborts the transaction
 */
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, unsigned long count)
{
	struct rb_node *node;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_head *head;
	int ret;
	int run_all = count == (unsigned long)-1;
	bool can_flush_pending_bgs = trans->can_flush_pending_bgs;

	/* We'll clean this up in btrfs_cleanup_transaction */
	if (trans->aborted)
		return 0;

	if (root->fs_info->creating_free_space_tree)
		return 0;

	if (root == root->fs_info->extent_root)
		root = root->fs_info->tree_root;

	delayed_refs = &trans->transaction->delayed_refs;
	if (count == 0)
		count = atomic_read(&delayed_refs->num_entries) * 2;

again:
#ifdef SCRAMBLE_DELAYED_REFS
	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
	trans->can_flush_pending_bgs = false;
	ret = __btrfs_run_delayed_refs(trans, root, count);
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (run_all) {
		if (!list_empty(&trans->new_bgs))
			btrfs_create_pending_block_groups(trans, root);

		spin_lock(&delayed_refs->lock);
		node = rb_first(&delayed_refs->href_root);
		if (!node) {
			spin_unlock(&delayed_refs->lock);
			goto out;
		}
		count = (unsigned long)-1;

		while (node) {
			head = rb_entry(node, struct btrfs_delayed_ref_head,
					href_node);
			if (btrfs_delayed_ref_is_head(&head->node)) {
				struct btrfs_delayed_ref_node *ref;

				ref = &head->node;
				atomic_inc(&ref->refs);

				spin_unlock(&delayed_refs->lock);
				/*
				 * Mutex was contended, block until it's
				 * released and try again
				 */
				mutex_lock(&head->mutex);
				mutex_unlock(&head->mutex);

				btrfs_put_delayed_ref(ref);
				cond_resched();
				goto again;
			} else {
				WARN_ON(1);
			}
			node = rb_next(node);
		}
		spin_unlock(&delayed_refs->lock);
		cond_resched();
		goto again;
	}
out:
	assert_qgroups_uptodate(trans);
	trans->can_flush_pending_bgs = can_flush_pending_bgs;
	return 0;
}

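/*
 * Queue a delayed extent op that updates the flags stored in the extent
 * item for the given block.
 */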
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
				u64 bytenr, u64 num_bytes, u64 flags,
				int level, int is_data)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	extent_op = btrfs_alloc_delayed_extent_op();
	if (!extent_op)
		return -ENOMEM;

	extent_op->flags_to_set = flags;
	extent_op->update_flags = true;
	extent_op->update_key = false;
	extent_op->is_data = is_data ? true : false;
	extent_op->level = level;

	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
					  num_bytes, extent_op);
	if (ret)
		btrfs_free_delayed_extent_op(extent_op);
	return ret;
}

static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      u64 objectid, u64 offset, u64 bytenr)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_node *ref;
	struct btrfs_delayed_data_ref *data_ref;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(trans, bytenr);
	if (!head) {
		spin_unlock(&delayed_refs->lock);
		return 0;
	}

	if (!mutex_trylock(&head->mutex)) {
		atomic_inc(&head->node.refs);
		spin_unlock(&delayed_refs->lock);

		btrfs_release_path(path);

		/*
		 * Mutex was contended, block until it's released and let
		 * caller try again
		 */
		mutex_lock(&head->mutex);
		mutex_unlock(&head->mutex);
		btrfs_put_delayed_ref(&head->node);
		return -EAGAIN;
	}
	spin_unlock(&delayed_refs->lock);

	spin_lock(&head->lock);
	list_for_each_entry(ref, &head->ref_list, list) {
		/* If it's a shared ref we know a cross reference exists */
		if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
			ret = 1;
			break;
		}

		data_ref = btrfs_delayed_node_to_data_ref(ref);

		/*
		 * If our ref doesn't match the one we're currently looking at
		 * then we have a cross reference.
		 */
		if (data_ref->root != root->root_key.objectid ||
		    data_ref->objectid != objectid ||
		    data_ref->offset != offset) {
			ret = 1;
			break;
		}
	}
	spin_unlock(&head->lock);
	mutex_unlock(&head->mutex);
	return ret;
}

static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					u64 objectid, u64 offset, u64 bytenr)
{
	struct btrfs_root *extent_root = root->fs_info->extent_root;
	struct extent_buffer *leaf;
	struct btrfs_extent_data_ref *ref;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_extent_item *ei;
	struct btrfs_key key;
	u32 item_size;
	int ret;

	key.objectid = bytenr;
	key.offset = (u64)-1;
	key.type = BTRFS_EXTENT_ITEM_KEY;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	BUG_ON(ret == 0); /* Corruption */

	ret = -ENOENT;
	if (path->slots[0] == 0)
		goto out;

	path->slots[0]--;
	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
		goto out;

	ret = 1;
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (item_size < sizeof(*ei)) {
		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
		goto out;
	}
#endif
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);

	if (item_size != sizeof(*ei) +
	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
		goto out;

	if (btrfs_extent_generation(leaf, ei) <=
	    btrfs_root_last_snapshot(&root->root_item))
		goto out;

	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
	if (btrfs_extent_inline_ref_type(leaf, iref) !=
	    BTRFS_EXTENT_DATA_REF_KEY)
		goto out;

	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
	if (btrfs_extent_refs(leaf, ei) !=
	    btrfs_extent_data_ref_count(leaf, ref) ||
	    btrfs_extent_data_ref_root(leaf, ref) !=
	    root->root_key.objectid ||
	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
		goto out;

	ret = 0;
out:
	return ret;
}

int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root,
			  u64 objectid, u64 offset, u64 bytenr)
{
	struct btrfs_path *path;
	int ret;
	int ret2;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOENT;

	do {
		ret = check_committed_ref(trans, root, path, objectid,
					  offset, bytenr);
		if (ret && ret != -ENOENT)
			goto out;

		ret2 = check_delayed_ref(trans, root, path, objectid,
					 offset, bytenr);
	} while (ret2 == -EAGAIN);

	if (ret2 && ret2 != -ENOENT) {
		ret = ret2;
		goto out;
	}

	if (ret != -ENOENT || ret2 != -ENOENT)
		ret = 0;
out:
	btrfs_free_path(path);
	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
		WARN_ON(ret > 0);
	return ret;
}

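/*
 * Add or drop (depending on 'inc') one reference for every extent pointed to
 * by a tree block, using either shared (full_backref) or normal backrefs.
 */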
static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct extent_buffer *buf,
			   int full_backref, int inc)
{
	u64 bytenr;
	u64 num_bytes;
	u64 parent;
	u64 ref_root;
	u32 nritems;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	int i;
	int level;
	int ret = 0;
	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
			    u64, u64, u64, u64, u64, u64);


	if (btrfs_is_testing(root->fs_info))
		return 0;

	ref_root = btrfs_header_owner(buf);
	nritems = btrfs_header_nritems(buf);
	level = btrfs_header_level(buf);

	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
		return 0;

	if (inc)
		process_func = btrfs_inc_extent_ref;
	else
		process_func = btrfs_free_extent;

	if (full_backref)
		parent = buf->start;
	else
		parent = 0;

	for (i = 0; i < nritems; i++) {
		if (level == 0) {
			btrfs_item_key_to_cpu(buf, &key, i);
			if (key.type != BTRFS_EXTENT_DATA_KEY)
				continue;
			fi = btrfs_item_ptr(buf, i,
					    struct btrfs_file_extent_item);
			if (btrfs_file_extent_type(buf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE)
				continue;
			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
			if (bytenr == 0)
				continue;

			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
			key.offset -= btrfs_file_extent_offset(buf, fi);
			ret = process_func(trans, root, bytenr, num_bytes,
					   parent, ref_root, key.objectid,
					   key.offset);
			if (ret)
				goto fail;
		} else {
			bytenr = btrfs_node_blockptr(buf, i);
			num_bytes = root->nodesize;
			ret = process_func(trans, root, bytenr, num_bytes,
					   parent, ref_root, level - 1, 0);
			if (ret)
				goto fail;
		}
	}
	return 0;
fail:
	return ret;
}

int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
}

int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref)
{
	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}

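/*
 * Write the in-memory block group item back to its slot in the extent tree.
 */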
static int write_one_cache_group(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct btrfs_block_group_cache *cache)
{
	int ret;
	struct btrfs_root *extent_root = root->fs_info->extent_root;
	unsigned long bi;
	struct extent_buffer *leaf;

	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto fail;
	}

	leaf = path->nodes[0];
	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
	btrfs_mark_buffer_dirty(leaf);
fail:
	btrfs_release_path(path);
	return ret;

}

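/*
 * Return the block group that follows 'cache' in bytenr order, dropping our
 * reference on 'cache'.  If 'cache' was removed in the meantime, restart the
 * search from the next bytenr.
 */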
static struct btrfs_block_group_cache *
next_block_group(struct btrfs_root *root,
		 struct btrfs_block_group_cache *cache)
{
	struct rb_node *node;

	spin_lock(&root->fs_info->block_group_cache_lock);

	/* If our block group was removed, we need a full search. */
	if (RB_EMPTY_NODE(&cache->cache_node)) {
		const u64 next_bytenr = cache->key.objectid + cache->key.offset;

		spin_unlock(&root->fs_info->block_group_cache_lock);
		btrfs_put_block_group(cache);
		cache = btrfs_lookup_first_block_group(root->fs_info,
						       next_bytenr);
		return cache;
	}
	node = rb_next(&cache->cache_node);
	btrfs_put_block_group(cache);
	if (node) {
		cache = rb_entry(node, struct btrfs_block_group_cache,
				 cache_node);
		btrfs_get_block_group(cache);
	} else
		cache = NULL;
	spin_unlock(&root->fs_info->block_group_cache_lock);
	return cache;
}

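/*
 * Prepare the free space cache inode for this block group so the cache can
 * be written out at commit time, preallocating space for it when needed.
 */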
static int cache_save_setup(struct btrfs_block_group_cache *block_group,
			    struct btrfs_trans_handle *trans,
			    struct btrfs_path *path)
{
	struct btrfs_root *root = block_group->fs_info->tree_root;
	struct inode *inode = NULL;
	u64 alloc_hint = 0;
	int dcs = BTRFS_DC_ERROR;
	u64 num_pages = 0;
	int retries = 0;
	int ret = 0;

	/*
	 * If this block group is smaller than 100 megs don't bother caching the
	 * block group.
	 */
	if (block_group->key.offset < (100 * SZ_1M)) {
		spin_lock(&block_group->lock);
		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
		spin_unlock(&block_group->lock);
		return 0;
	}

	if (trans->aborted)
		return 0;
again:
	inode = lookup_free_space_inode(root, block_group, path);
	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
		ret = PTR_ERR(inode);
		btrfs_release_path(path);
		goto out;
	}

	if (IS_ERR(inode)) {
		BUG_ON(retries);
		retries++;

		if (block_group->ro)
			goto out_free;

		ret = create_free_space_inode(root, trans, block_group, path);
		if (ret)
			goto out_free;
		goto again;
	}

	/* We've already setup this transaction, go ahead and exit */
	if (block_group->cache_generation == trans->transid &&
	    i_size_read(inode)) {
		dcs = BTRFS_DC_SETUP;
		goto out_put;
	}

	/*
	 * We want to set the generation to 0, that way if anything goes wrong
	 * from here on out we know not to trust this cache when we load up next
	 * time.
	 */
	BTRFS_I(inode)->generation = 0;
	ret = btrfs_update_inode(trans, root, inode);
	if (ret) {
		/*
		 * So theoretically we could recover from this, simply set the
		 * super cache generation to 0 so we know to invalidate the
		 * cache, but then we'd have to keep track of the block groups
		 * that fail this way so we know we _have_ to reset this cache
		 * before the next commit or risk reading stale cache.  So to
		 * limit our exposure to horrible edge cases lets just abort the
		 * transaction, this only happens in really bad situations
		 * anyway.
		 */
		btrfs_abort_transaction(trans, ret);
		goto out_put;
	}
	WARN_ON(ret);

	if (i_size_read(inode) > 0) {
		ret = btrfs_check_trunc_cache_free_space(root,
					&root->fs_info->global_block_rsv);
		if (ret)
			goto out_put;

		ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
		if (ret)
			goto out_put;
	}

	spin_lock(&block_group->lock);
	if (block_group->cached != BTRFS_CACHE_FINISHED ||
	    !btrfs_test_opt(root->fs_info, SPACE_CACHE)) {
		/*
		 * don't bother trying to write stuff out _if_
		 * a) we're not cached,
		 * b) we're with nospace_cache mount option.
		 */
		dcs = BTRFS_DC_WRITTEN;
		spin_unlock(&block_group->lock);
		goto out_put;
	}
	spin_unlock(&block_group->lock);

	/*
	 * We hit an ENOSPC when setting up the cache in this transaction, just
	 * skip doing the setup, we've already cleared the cache so we're safe.
	 */
	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
		ret = -ENOSPC;
		goto out_put;
	}

	/*
	 * Try to preallocate enough space based on how big the block group is.
	 * Keep in mind this has to include any pinned space which could end up
	 * taking up quite a bit since it's not folded into the other space
	 * cache.
	 */
	num_pages = div_u64(block_group->key.offset, SZ_256M);
	if (!num_pages)
		num_pages = 1;

	num_pages *= 16;
	num_pages *= PAGE_SIZE;

	ret = btrfs_check_data_free_space(inode, 0, num_pages);
	if (ret)
		goto out_put;

	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
					      num_pages, num_pages,
					      &alloc_hint);
3478 3479 3480 3481 3482 3483 3484 3485
	/*
	 * Our cache requires contiguous chunks so that we don't modify a bunch
	 * of metadata or split extents when writing the cache out, which means
	 * we can enospc if we are heavily fragmented in addition to just normal
	 * out of space conditions.  So if we hit this just skip setting up any
	 * other block groups for this transaction, maybe we'll unpin enough
	 * space the next time around.
	 */
3486 3487
	if (!ret)
		dcs = BTRFS_DC_SETUP;
3488 3489
	else if (ret == -ENOSPC)
		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3490

3491 3492 3493
out_put:
	iput(inode);
out_free:
3494
	btrfs_release_path(path);
3495 3496
out:
	spin_lock(&block_group->lock);
3497
	if (!ret && dcs == BTRFS_DC_SETUP)
3498
		block_group->cache_generation = trans->transid;
3499
	block_group->disk_cache_state = dcs;
3500 3501 3502 3503 3504
	spin_unlock(&block_group->lock);

	return ret;
}

3505 3506 3507 3508 3509 3510 3511 3512
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root)
{
	struct btrfs_block_group_cache *cache, *tmp;
	struct btrfs_transaction *cur_trans = trans->transaction;
	struct btrfs_path *path;

	if (list_empty(&cur_trans->dirty_bgs) ||
3513
	    !btrfs_test_opt(root->fs_info, SPACE_CACHE))
3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Could add new block groups, use _safe just in case */
	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
				 dirty_list) {
		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
			cache_save_setup(cache, trans, path);
	}

	btrfs_free_path(path);
	return 0;
}

3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543
/*
 * transaction commit does final block group cache writeback during a
 * critical section where nothing is allowed to change the FS.  This is
 * required in order for the cache to actually match the block group,
 * but can introduce a lot of latency into the commit.
 *
 * So, btrfs_start_dirty_block_groups is here to kick off block group
 * cache IO.  There's a chance we'll have to redo some of it if the
 * block group changes again during the commit, but it greatly reduces
 * the commit latency by getting rid of the easy block groups while
 * we're still allowing others to join the commit.
 */
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
3544
				   struct btrfs_root *root)
C
Chris Mason 已提交
3545
{
3546
	struct btrfs_block_group_cache *cache;
3547 3548
	struct btrfs_transaction *cur_trans = trans->transaction;
	int ret = 0;
3549
	int should_put;
3550 3551 3552
	struct btrfs_path *path = NULL;
	LIST_HEAD(dirty);
	struct list_head *io = &cur_trans->io_bgs;
3553
	int num_started = 0;
3554 3555 3556
	int loops = 0;

	spin_lock(&cur_trans->dirty_bgs_lock);
3557 3558 3559
	if (list_empty(&cur_trans->dirty_bgs)) {
		spin_unlock(&cur_trans->dirty_bgs_lock);
		return 0;
3560
	}
3561
	list_splice_init(&cur_trans->dirty_bgs, &dirty);
3562
	spin_unlock(&cur_trans->dirty_bgs_lock);
3563

3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576
again:
	/*
	 * make sure all the block groups on our dirty list actually
	 * exist
	 */
	btrfs_create_pending_block_groups(trans, root);

	if (!path) {
		path = btrfs_alloc_path();
		if (!path)
			return -ENOMEM;
	}

3577 3578 3579 3580 3581 3582
	/*
	 * cache_write_mutex is here only to save us from balance or automatic
	 * removal of empty block groups deleting this block group while we are
	 * writing out the cache
	 */
	mutex_lock(&trans->transaction->cache_write_mutex);
3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636
	while (!list_empty(&dirty)) {
		cache = list_first_entry(&dirty,
					 struct btrfs_block_group_cache,
					 dirty_list);
		/*
		 * this can happen if something re-dirties a block
		 * group that is already under IO.  Just wait for it to
		 * finish and then do it all again
		 */
		if (!list_empty(&cache->io_list)) {
			list_del_init(&cache->io_list);
			btrfs_wait_cache_io(root, trans, cache,
					    &cache->io_ctl, path,
					    cache->key.objectid);
			btrfs_put_block_group(cache);
		}


		/*
		 * btrfs_wait_cache_io uses the cache->dirty_list to decide
		 * if it should update the cache_state.  Don't delete
		 * until after we wait.
		 *
		 * Since we're not running in the commit critical section
		 * we need the dirty_bgs_lock to protect from update_block_group
		 */
		spin_lock(&cur_trans->dirty_bgs_lock);
		list_del_init(&cache->dirty_list);
		spin_unlock(&cur_trans->dirty_bgs_lock);

		should_put = 1;

		cache_save_setup(cache, trans, path);

		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
			cache->io_ctl.inode = NULL;
			ret = btrfs_write_out_cache(root, trans, cache, path);
			if (ret == 0 && cache->io_ctl.inode) {
				num_started++;
				should_put = 0;

				/*
				 * the cache_write_mutex is protecting
				 * the io_list
				 */
				list_add_tail(&cache->io_list, io);
			} else {
				/*
				 * if we failed to write the cache, the
				 * generation will be bad and life goes on
				 */
				ret = 0;
			}
		}
3637
		if (!ret) {
3638
			ret = write_one_cache_group(trans, root, path, cache);
3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657
			/*
			 * Our block group might still be attached to the list
			 * of new block groups in the transaction handle of some
			 * other task (struct btrfs_trans_handle->new_bgs). This
			 * means its block group item isn't yet in the extent
			 * tree. If this happens ignore the error, as we will
			 * try again later in the critical section of the
			 * transaction commit.
			 */
			if (ret == -ENOENT) {
				ret = 0;
				spin_lock(&cur_trans->dirty_bgs_lock);
				if (list_empty(&cache->dirty_list)) {
					list_add_tail(&cache->dirty_list,
						      &cur_trans->dirty_bgs);
					btrfs_get_block_group(cache);
				}
				spin_unlock(&cur_trans->dirty_bgs_lock);
			} else if (ret) {
3658
				btrfs_abort_transaction(trans, ret);
3659 3660
			}
		}
3661 3662 3663 3664 3665 3666 3667

		/* if its not on the io list, we need to put the block group */
		if (should_put)
			btrfs_put_block_group(cache);

		if (ret)
			break;
3668 3669 3670 3671 3672 3673 3674 3675

		/*
		 * Avoid blocking other tasks for too long. It might even save
		 * us from writing caches for block groups that are going to be
		 * removed.
		 */
		mutex_unlock(&trans->transaction->cache_write_mutex);
		mutex_lock(&trans->transaction->cache_write_mutex);
3676
	}
3677
	mutex_unlock(&trans->transaction->cache_write_mutex);
3678 3679 3680 3681 3682 3683 3684 3685 3686 3687

	/*
	 * go through delayed refs for all the stuff we've just kicked off
	 * and then loop back (just once)
	 */
	ret = btrfs_run_delayed_refs(trans, root, 0);
	if (!ret && loops == 0) {
		loops++;
		spin_lock(&cur_trans->dirty_bgs_lock);
		list_splice_init(&cur_trans->dirty_bgs, &dirty);
3688 3689 3690 3691 3692 3693 3694 3695
		/*
		 * dirty_bgs_lock protects us from concurrent block group
		 * deletes too (not just cache_write_mutex).
		 */
		if (!list_empty(&dirty)) {
			spin_unlock(&cur_trans->dirty_bgs_lock);
			goto again;
		}
3696
		spin_unlock(&cur_trans->dirty_bgs_lock);
3697 3698
	} else if (ret < 0) {
		btrfs_cleanup_dirty_bgs(cur_trans, root);
3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714
	}

	btrfs_free_path(path);
	return ret;
}

int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root)
{
	struct btrfs_block_group_cache *cache;
	struct btrfs_transaction *cur_trans = trans->transaction;
	int ret = 0;
	int should_put;
	struct btrfs_path *path;
	struct list_head *io = &cur_trans->io_bgs;
	int num_started = 0;
C
Chris Mason 已提交
3715 3716 3717 3718 3719

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

3720
	/*
3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731
	 * Even though we are in the critical section of the transaction commit,
	 * we can still have concurrent tasks adding elements to this
	 * transaction's list of dirty block groups. These tasks correspond to
	 * endio free space workers started when writeback finishes for a
	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
	 * allocate new block groups as a result of COWing nodes of the root
	 * tree when updating the free space inode. The writeback for the space
	 * caches is triggered by an earlier call to
	 * btrfs_start_dirty_block_groups() and iterations of the following
	 * loop.
	 * Also we want to do the cache_save_setup first and then run the
3732 3733 3734
	 * delayed refs to make sure we have the best chance at doing this all
	 * in one shot.
	 */
3735
	spin_lock(&cur_trans->dirty_bgs_lock);
3736 3737 3738 3739
	while (!list_empty(&cur_trans->dirty_bgs)) {
		cache = list_first_entry(&cur_trans->dirty_bgs,
					 struct btrfs_block_group_cache,
					 dirty_list);
3740 3741 3742 3743 3744 3745 3746

		/*
		 * this can happen if cache_save_setup re-dirties a block
		 * group that is already under IO.  Just wait for it to
		 * finish and then do it all again
		 */
		if (!list_empty(&cache->io_list)) {
3747
			spin_unlock(&cur_trans->dirty_bgs_lock);
3748 3749 3750 3751 3752
			list_del_init(&cache->io_list);
			btrfs_wait_cache_io(root, trans, cache,
					    &cache->io_ctl, path,
					    cache->key.objectid);
			btrfs_put_block_group(cache);
3753
			spin_lock(&cur_trans->dirty_bgs_lock);
3754 3755
		}

3756 3757 3758 3759
		/*
		 * don't remove from the dirty list until after we've waited
		 * on any pending IO
		 */
3760
		list_del_init(&cache->dirty_list);
3761
		spin_unlock(&cur_trans->dirty_bgs_lock);
3762 3763
		should_put = 1;

3764
		cache_save_setup(cache, trans, path);
3765

3766
		if (!ret)
3767 3768 3769 3770 3771 3772 3773 3774
			ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);

		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
			cache->io_ctl.inode = NULL;
			ret = btrfs_write_out_cache(root, trans, cache, path);
			if (ret == 0 && cache->io_ctl.inode) {
				num_started++;
				should_put = 0;
3775
				list_add_tail(&cache->io_list, io);
3776 3777 3778 3779 3780 3781 3782 3783
			} else {
				/*
				 * if we failed to write the cache, the
				 * generation will be bad and life goes on
				 */
				ret = 0;
			}
		}
3784
		if (!ret) {
3785
			ret = write_one_cache_group(trans, root, path, cache);
3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804
			/*
			 * One of the free space endio workers might have
			 * created a new block group while updating a free space
			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
			 * and hasn't released its transaction handle yet, in
			 * which case the new block group is still attached to
			 * its transaction handle and its creation has not
			 * finished yet (no block group item in the extent tree
			 * yet, etc). If this is the case, wait for all free
			 * space endio workers to finish and retry. This is a
			 * a very rare case so no need for a more efficient and
			 * complex approach.
			 */
			if (ret == -ENOENT) {
				wait_event(cur_trans->writer_wait,
				   atomic_read(&cur_trans->num_writers) == 1);
				ret = write_one_cache_group(trans, root, path,
							    cache);
			}
3805
			if (ret)
3806
				btrfs_abort_transaction(trans, ret);
3807
		}
3808 3809 3810 3811

		/* if its not on the io list, we need to put the block group */
		if (should_put)
			btrfs_put_block_group(cache);
3812
		spin_lock(&cur_trans->dirty_bgs_lock);
3813
	}
3814
	spin_unlock(&cur_trans->dirty_bgs_lock);
3815

3816 3817
	while (!list_empty(io)) {
		cache = list_first_entry(io, struct btrfs_block_group_cache,
3818 3819 3820 3821
					 io_list);
		list_del_init(&cache->io_list);
		btrfs_wait_cache_io(root, trans, cache,
				    &cache->io_ctl, path, cache->key.objectid);
J
Josef Bacik 已提交
3822 3823 3824
		btrfs_put_block_group(cache);
	}

C
Chris Mason 已提交
3825
	btrfs_free_path(path);
3826
	return ret;
C
Chris Mason 已提交
3827 3828
}

3829 3830 3831 3832 3833 3834 3835 3836 3837
int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
{
	struct btrfs_block_group_cache *block_group;
	int readonly = 0;

	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
	if (!block_group || block_group->ro)
		readonly = 1;
	if (block_group)
3838
		btrfs_put_block_group(block_group);
3839 3840 3841
	return readonly;
}

3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894
bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_block_group_cache *bg;
	bool ret = true;

	bg = btrfs_lookup_block_group(fs_info, bytenr);
	if (!bg)
		return false;

	spin_lock(&bg->lock);
	if (bg->ro)
		ret = false;
	else
		atomic_inc(&bg->nocow_writers);
	spin_unlock(&bg->lock);

	/* no put on block group, done by btrfs_dec_nocow_writers */
	if (!ret)
		btrfs_put_block_group(bg);

	return ret;

}

void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_block_group_cache *bg;

	bg = btrfs_lookup_block_group(fs_info, bytenr);
	ASSERT(bg);
	if (atomic_dec_and_test(&bg->nocow_writers))
		wake_up_atomic_t(&bg->nocow_writers);
	/*
	 * Once for our lookup and once for the lookup done by a previous call
	 * to btrfs_inc_nocow_writers()
	 */
	btrfs_put_block_group(bg);
	btrfs_put_block_group(bg);
}

static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
{
	schedule();
	return 0;
}

void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
{
	wait_on_atomic_t(&bg->nocow_writers,
			 btrfs_wait_nocow_writers_atomic_t,
			 TASK_UNINTERRUPTIBLE);
}

3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911
static const char *alloc_name(u64 flags)
{
	switch (flags) {
	case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
		return "mixed";
	case BTRFS_BLOCK_GROUP_METADATA:
		return "metadata";
	case BTRFS_BLOCK_GROUP_DATA:
		return "data";
	case BTRFS_BLOCK_GROUP_SYSTEM:
		return "system";
	default:
		WARN_ON(1);
		return "invalid-combination";
	};
}

3912 3913
static int update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
3914
			     u64 bytes_readonly,
3915 3916 3917
			     struct btrfs_space_info **space_info)
{
	struct btrfs_space_info *found;
3918 3919
	int i;
	int factor;
3920
	int ret;
3921 3922 3923 3924 3925 3926

	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
		     BTRFS_BLOCK_GROUP_RAID10))
		factor = 2;
	else
		factor = 1;
3927 3928 3929

	found = __find_space_info(info, flags);
	if (found) {
3930
		spin_lock(&found->lock);
3931
		found->total_bytes += total_bytes;
J
Josef Bacik 已提交
3932
		found->disk_total += total_bytes * factor;
3933
		found->bytes_used += bytes_used;
3934
		found->disk_used += bytes_used * factor;
3935
		found->bytes_readonly += bytes_readonly;
3936 3937
		if (total_bytes > 0)
			found->full = 0;
3938 3939
		space_info_add_new_bytes(info, found, total_bytes -
					 bytes_used - bytes_readonly);
3940
		spin_unlock(&found->lock);
3941 3942 3943
		*space_info = found;
		return 0;
	}
Y
Yan Zheng 已提交
3944
	found = kzalloc(sizeof(*found), GFP_NOFS);
3945 3946 3947
	if (!found)
		return -ENOMEM;

3948
	ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
3949 3950 3951 3952 3953
	if (ret) {
		kfree(found);
		return ret;
	}

3954
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3955
		INIT_LIST_HEAD(&found->block_groups[i]);
3956
	init_rwsem(&found->groups_sem);
J
Josef Bacik 已提交
3957
	spin_lock_init(&found->lock);
3958
	found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3959
	found->total_bytes = total_bytes;
J
Josef Bacik 已提交
3960
	found->disk_total = total_bytes * factor;
3961
	found->bytes_used = bytes_used;
3962
	found->disk_used = bytes_used * factor;
3963
	found->bytes_pinned = 0;
3964
	found->bytes_reserved = 0;
3965
	found->bytes_readonly = bytes_readonly;
3966
	found->bytes_may_use = 0;
3967
	found->full = 0;
3968
	found->max_extent_size = 0;
3969
	found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3970
	found->chunk_alloc = 0;
3971 3972
	found->flush = 0;
	init_waitqueue_head(&found->wait);
3973
	INIT_LIST_HEAD(&found->ro_bgs);
3974 3975
	INIT_LIST_HEAD(&found->tickets);
	INIT_LIST_HEAD(&found->priority_tickets);
3976 3977 3978 3979 3980 3981 3982 3983 3984

	ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
				    info->space_info_kobj, "%s",
				    alloc_name(found->flags));
	if (ret) {
		kfree(found);
		return ret;
	}

3985
	*space_info = found;
3986
	list_add_rcu(&found->list, &info->space_info);
3987 3988
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = found;
3989 3990

	return ret;
3991 3992
}

3993 3994
static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
3995 3996
	u64 extra_flags = chunk_to_extended(flags) &
				BTRFS_EXTENDED_PROFILE_MASK;
3997

3998
	write_seqlock(&fs_info->profiles_lock);
3999 4000 4001 4002 4003 4004
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits |= extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits |= extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits |= extra_flags;
4005
	write_sequnlock(&fs_info->profiles_lock);
4006
}
4007

4008 4009 4010
/*
 * returns target flags in extended format or 0 if restripe for this
 * chunk_type is not in progress
4011 4012
 *
 * should be called with either volume_mutex or balance_lock held
4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035
 */
static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
	u64 target = 0;

	if (!bctl)
		return 0;

	if (flags & BTRFS_BLOCK_GROUP_DATA &&
	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
	}

	return target;
}

4036 4037 4038
/*
 * @flags: available profiles in extended format (see ctree.h)
 *
4039 4040 4041
 * Returns reduced profile in chunk format.  If profile changing is in
 * progress (either running or paused) picks the target profile (if it's
 * already available), otherwise falls back to plain reducing.
4042
 */
4043
static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
4044
{
4045
	u64 num_devices = root->fs_info->fs_devices->rw_devices;
4046
	u64 target;
4047 4048
	u64 raid_type;
	u64 allowed = 0;
4049

4050 4051 4052 4053
	/*
	 * see if restripe for this chunk_type is in progress, if so
	 * try to reduce to the target profile
	 */
4054
	spin_lock(&root->fs_info->balance_lock);
4055 4056 4057 4058
	target = get_restripe_target(root->fs_info, flags);
	if (target) {
		/* pick target profile only if it's already available */
		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
4059
			spin_unlock(&root->fs_info->balance_lock);
4060
			return extended_to_chunk(target);
4061 4062 4063 4064
		}
	}
	spin_unlock(&root->fs_info->balance_lock);

D
David Woodhouse 已提交
4065
	/* First, mask out the RAID levels which aren't possible */
4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085
	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
		if (num_devices >= btrfs_raid_array[raid_type].devs_min)
			allowed |= btrfs_raid_group[raid_type];
	}
	allowed &= flags;

	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
		allowed = BTRFS_BLOCK_GROUP_RAID6;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
		allowed = BTRFS_BLOCK_GROUP_RAID5;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
		allowed = BTRFS_BLOCK_GROUP_RAID10;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
		allowed = BTRFS_BLOCK_GROUP_RAID1;
	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
		allowed = BTRFS_BLOCK_GROUP_RAID0;

	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;

	return extended_to_chunk(flags | allowed);
4086 4087
}

4088
static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
J
Josef Bacik 已提交
4089
{
4090
	unsigned seq;
4091
	u64 flags;
4092 4093

	do {
4094
		flags = orig_flags;
4095 4096 4097 4098 4099 4100 4101 4102 4103
		seq = read_seqbegin(&root->fs_info->profiles_lock);

		if (flags & BTRFS_BLOCK_GROUP_DATA)
			flags |= root->fs_info->avail_data_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
			flags |= root->fs_info->avail_system_alloc_bits;
		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
			flags |= root->fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
4104

4105
	return btrfs_reduce_alloc_profile(root, flags);
J
Josef Bacik 已提交
4106 4107
}

4108
u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
J
Josef Bacik 已提交
4109
{
4110
	u64 flags;
D
David Woodhouse 已提交
4111
	u64 ret;
J
Josef Bacik 已提交
4112

4113 4114 4115 4116
	if (data)
		flags = BTRFS_BLOCK_GROUP_DATA;
	else if (root == root->fs_info->chunk_root)
		flags = BTRFS_BLOCK_GROUP_SYSTEM;
J
Josef Bacik 已提交
4117
	else
4118
		flags = BTRFS_BLOCK_GROUP_METADATA;
J
Josef Bacik 已提交
4119

D
David Woodhouse 已提交
4120 4121
	ret = get_alloc_profile(root, flags);
	return ret;
J
Josef Bacik 已提交
4122
}
J
Josef Bacik 已提交
4123

4124
int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
J
Josef Bacik 已提交
4125 4126
{
	struct btrfs_space_info *data_sinfo;
4127
	struct btrfs_root *root = BTRFS_I(inode)->root;
4128
	struct btrfs_fs_info *fs_info = root->fs_info;
4129
	u64 used;
4130
	int ret = 0;
4131 4132
	int need_commit = 2;
	int have_pinned_space;
J
Josef Bacik 已提交
4133 4134

	/* make sure bytes are sectorsize aligned */
4135
	bytes = ALIGN(bytes, root->sectorsize);
J
Josef Bacik 已提交
4136

4137
	if (btrfs_is_free_space_inode(inode)) {
4138
		need_commit = 0;
4139
		ASSERT(current->journal_info);
4140 4141
	}

4142
	data_sinfo = fs_info->data_sinfo;
C
Chris Mason 已提交
4143 4144
	if (!data_sinfo)
		goto alloc;
J
Josef Bacik 已提交
4145

J
Josef Bacik 已提交
4146 4147 4148
again:
	/* make sure we have enough space to handle the data first */
	spin_lock(&data_sinfo->lock);
4149 4150 4151
	used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
		data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
		data_sinfo->bytes_may_use;
4152 4153

	if (used + bytes > data_sinfo->total_bytes) {
4154
		struct btrfs_trans_handle *trans;
J
Josef Bacik 已提交
4155

J
Josef Bacik 已提交
4156 4157 4158 4159
		/*
		 * if we don't have enough free bytes in this space then we need
		 * to alloc a new chunk.
		 */
4160
		if (!data_sinfo->full) {
J
Josef Bacik 已提交
4161
			u64 alloc_target;
J
Josef Bacik 已提交
4162

4163
			data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
J
Josef Bacik 已提交
4164
			spin_unlock(&data_sinfo->lock);
C
Chris Mason 已提交
4165
alloc:
J
Josef Bacik 已提交
4166
			alloc_target = btrfs_get_alloc_profile(root, 1);
4167 4168 4169 4170 4171 4172 4173 4174 4175 4176
			/*
			 * It is ugly that we don't call nolock join
			 * transaction for the free space inode case here.
			 * But it is safe because we only do the data space
			 * reservation for the free space cache in the
			 * transaction context, the common join transaction
			 * just increase the counter of the current transaction
			 * handler, doesn't try to acquire the trans_lock of
			 * the fs.
			 */
4177
			trans = btrfs_join_transaction(root);
4178 4179
			if (IS_ERR(trans))
				return PTR_ERR(trans);
J
Josef Bacik 已提交
4180

J
Josef Bacik 已提交
4181
			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4182 4183
					     alloc_target,
					     CHUNK_ALLOC_NO_FORCE);
J
Josef Bacik 已提交
4184
			btrfs_end_transaction(trans, root);
4185 4186 4187
			if (ret < 0) {
				if (ret != -ENOSPC)
					return ret;
4188 4189
				else {
					have_pinned_space = 1;
4190
					goto commit_trans;
4191
				}
4192
			}
J
Josef Bacik 已提交
4193

4194 4195 4196
			if (!data_sinfo)
				data_sinfo = fs_info->data_sinfo;

J
Josef Bacik 已提交
4197 4198
			goto again;
		}
4199 4200

		/*
4201
		 * If we don't have enough pinned space to deal with this
4202 4203
		 * allocation, and no removed chunk in current transaction,
		 * don't bother committing the transaction.
4204
		 */
4205 4206 4207
		have_pinned_space = percpu_counter_compare(
			&data_sinfo->total_bytes_pinned,
			used + bytes - data_sinfo->total_bytes);
J
Josef Bacik 已提交
4208 4209
		spin_unlock(&data_sinfo->lock);

4210
		/* commit the current transaction and try again */
4211
commit_trans:
4212
		if (need_commit &&
J
Josef Bacik 已提交
4213
		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
4214
			need_commit--;
4215

4216 4217
			if (need_commit > 0) {
				btrfs_start_delalloc_roots(fs_info, 0, -1);
4218
				btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
4219
			}
4220

4221
			trans = btrfs_join_transaction(root);
4222 4223
			if (IS_ERR(trans))
				return PTR_ERR(trans);
4224
			if (have_pinned_space >= 0 ||
4225 4226
			    test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
				     &trans->transaction->flags) ||
4227
			    need_commit > 0) {
4228 4229 4230
				ret = btrfs_commit_transaction(trans, root);
				if (ret)
					return ret;
4231
				/*
4232 4233 4234
				 * The cleaner kthread might still be doing iput
				 * operations. Wait for it to finish so that
				 * more space is released.
4235
				 */
4236 4237
				mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
				mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
4238 4239 4240 4241
				goto again;
			} else {
				btrfs_end_transaction(trans, root);
			}
4242
		}
J
Josef Bacik 已提交
4243

4244 4245 4246
		trace_btrfs_space_reservation(root->fs_info,
					      "space_info:enospc",
					      data_sinfo->flags, bytes, 1);
J
Josef Bacik 已提交
4247 4248 4249
		return -ENOSPC;
	}
	data_sinfo->bytes_may_use += bytes;
J
Josef Bacik 已提交
4250
	trace_btrfs_space_reservation(root->fs_info, "space_info",
4251
				      data_sinfo->flags, bytes, 1);
J
Josef Bacik 已提交
4252 4253
	spin_unlock(&data_sinfo->lock);

4254
	return ret;
J
Josef Bacik 已提交
4255
}
J
Josef Bacik 已提交
4256

4257 4258 4259 4260 4261
/*
 * New check_data_free_space() with ability for precious data reservation
 * Will replace old btrfs_check_data_free_space(), but for patch split,
 * add a new function first and then replace it.
 */
4262
int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;

	/* align the range */
	len = round_up(start + len, root->sectorsize) -
	      round_down(start, root->sectorsize);
	start = round_down(start, root->sectorsize);

	ret = btrfs_alloc_data_chunk_ondemand(inode, len);
	if (ret < 0)
		return ret;

4276
	/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
4277
	ret = btrfs_qgroup_reserve_data(inode, start, len);
4278 4279
	if (ret)
		btrfs_free_reserved_data_space_noquota(inode, start, len);
4280 4281 4282 4283 4284 4285 4286
	return ret;
}

/*
 * Called if we need to clear a data reservation for this inode
 * Normally in a error case.
 *
4287 4288 4289
 * This one will *NOT* use accurate qgroup reserved space API, just for case
 * which we can't sleep and is sure it won't affect qgroup reserved space.
 * Like clear_bit_hook().
4290
 */
4291 4292
void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
					    u64 len)
4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_space_info *data_sinfo;

	/* Make sure the range is aligned to sectorsize */
	len = round_up(start + len, root->sectorsize) -
	      round_down(start, root->sectorsize);
	start = round_down(start, root->sectorsize);

	data_sinfo = root->fs_info->data_sinfo;
	spin_lock(&data_sinfo->lock);
	if (WARN_ON(data_sinfo->bytes_may_use < len))
		data_sinfo->bytes_may_use = 0;
	else
		data_sinfo->bytes_may_use -= len;
	trace_btrfs_space_reservation(root->fs_info, "space_info",
				      data_sinfo->flags, len, 0);
	spin_unlock(&data_sinfo->lock);
}

4313 4314 4315 4316
/*
 * Called if we need to clear a data reservation for this inode
 * Normally in a error case.
 *
4317
 * This one will handle the per-inode data rsv map for accurate reserved
4318 4319 4320 4321 4322 4323 4324 4325
 * space framework.
 */
void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
{
	btrfs_free_reserved_data_space_noquota(inode, start, len);
	btrfs_qgroup_free_data(inode, start, len);
}

4326
static void force_metadata_allocation(struct btrfs_fs_info *info)
4327
{
4328 4329
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;
4330

4331 4332 4333
	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
4334
			found->force_alloc = CHUNK_ALLOC_FORCE;
4335
	}
4336
	rcu_read_unlock();
4337 4338
}

4339 4340 4341 4342 4343
static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
	return (global->size << 1);
}

4344
static int should_alloc_chunk(struct btrfs_root *root,
4345
			      struct btrfs_space_info *sinfo, int force)
4346
{
4347
	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4348
	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
4349
	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
4350
	u64 thresh;
4351

4352 4353 4354
	if (force == CHUNK_ALLOC_FORCE)
		return 1;

4355 4356 4357 4358 4359
	/*
	 * We need to take into account the global rsv because for all intents
	 * and purposes it's used space.  Don't worry about locking the
	 * global_rsv, it doesn't change except when the transaction commits.
	 */
4360
	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
4361
		num_allocated += calc_global_rsv_need_space(global_rsv);
4362

4363 4364 4365 4366 4367
	/*
	 * in limited mode, we want to have some free space up to
	 * about 1% of the FS size.
	 */
	if (force == CHUNK_ALLOC_LIMITED) {
4368
		thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
4369
		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
4370 4371 4372 4373 4374

		if (num_bytes - num_allocated < thresh)
			return 1;
	}

4375
	if (num_allocated + SZ_2M < div_factor(num_bytes, 8))
4376
		return 0;
4377
	return 1;
4378 4379
}

4380
static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
4381 4382 4383
{
	u64 num_dev;

D
David Woodhouse 已提交
4384 4385 4386 4387
	if (type & (BTRFS_BLOCK_GROUP_RAID10 |
		    BTRFS_BLOCK_GROUP_RAID0 |
		    BTRFS_BLOCK_GROUP_RAID5 |
		    BTRFS_BLOCK_GROUP_RAID6))
4388 4389 4390 4391 4392 4393
		num_dev = root->fs_info->fs_devices->rw_devices;
	else if (type & BTRFS_BLOCK_GROUP_RAID1)
		num_dev = 2;
	else
		num_dev = 1;	/* DUP or single */

4394
	return num_dev;
4395 4396
}

4397 4398 4399 4400 4401 4402 4403
/*
 * If @is_allocation is true, reserve space in the system space info necessary
 * for allocating a chunk, otherwise if it's false, reserve space necessary for
 * removing a chunk.
 */
void check_system_chunk(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
4404
			u64 type)
4405 4406 4407 4408
{
	struct btrfs_space_info *info;
	u64 left;
	u64 thresh;
4409
	int ret = 0;
4410
	u64 num_devs;
4411 4412 4413 4414 4415 4416

	/*
	 * Needed because we can end up allocating a system chunk and for an
	 * atomic and race free space reservation in the chunk block reserve.
	 */
	ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
4417 4418 4419 4420

	info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
	spin_lock(&info->lock);
	left = info->total_bytes - info->bytes_used - info->bytes_pinned -
4421 4422
		info->bytes_reserved - info->bytes_readonly -
		info->bytes_may_use;
4423 4424
	spin_unlock(&info->lock);

4425 4426 4427
	num_devs = get_profile_num_devs(root, type);

	/* num_devs device items to update and 1 chunk item to add or remove */
4428 4429
	thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
		btrfs_calc_trans_metadata_size(root, 1);
4430

4431
	if (left < thresh && btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
4432 4433
		btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
			left, thresh, type);
4434 4435 4436 4437 4438 4439 4440
		dump_space_info(info, 0, 0);
	}

	if (left < thresh) {
		u64 flags;

		flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455
		/*
		 * Ignore failure to create system chunk. We might end up not
		 * needing it, as we might not need to COW all nodes/leafs from
		 * the paths we visit in the chunk tree (they were already COWed
		 * or created in the current transaction for example).
		 */
		ret = btrfs_alloc_chunk(trans, root, flags);
	}

	if (!ret) {
		ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
					  &root->fs_info->chunk_block_rsv,
					  thresh, BTRFS_RESERVE_NO_FLUSH);
		if (!ret)
			trans->chunk_bytes_reserved += thresh;
4456 4457 4458
	}
}

4459 4460 4461 4462 4463 4464 4465 4466 4467
/*
 * If force is CHUNK_ALLOC_FORCE:
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 * If force is NOT CHUNK_ALLOC_FORCE:
 *    - return 0 if it doesn't need to allocate a new chunk,
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 */
4468
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
4469
			  struct btrfs_root *extent_root, u64 flags, int force)
J
Josef Bacik 已提交
4470
{
4471
	struct btrfs_space_info *space_info;
4472
	struct btrfs_fs_info *fs_info = extent_root->fs_info;
4473
	int wait_for_alloc = 0;
J
Josef Bacik 已提交
4474 4475
	int ret = 0;

4476 4477 4478 4479
	/* Don't re-enter if we're already allocating a chunk */
	if (trans->allocating_chunk)
		return -ENOSPC;

4480
	space_info = __find_space_info(extent_root->fs_info, flags);
4481 4482
	if (!space_info) {
		ret = update_space_info(extent_root->fs_info, flags,
4483
					0, 0, 0, &space_info);
4484
		BUG_ON(ret); /* -ENOMEM */
J
Josef Bacik 已提交
4485
	}
4486
	BUG_ON(!space_info); /* Logic error */
J
Josef Bacik 已提交
4487

4488
again:
4489
	spin_lock(&space_info->lock);
4490
	if (force < space_info->force_alloc)
4491
		force = space_info->force_alloc;
4492
	if (space_info->full) {
4493 4494 4495 4496
		if (should_alloc_chunk(extent_root, space_info, force))
			ret = -ENOSPC;
		else
			ret = 0;
4497
		spin_unlock(&space_info->lock);
4498
		return ret;
J
Josef Bacik 已提交
4499 4500
	}

4501
	if (!should_alloc_chunk(extent_root, space_info, force)) {
4502
		spin_unlock(&space_info->lock);
4503 4504 4505 4506 4507
		return 0;
	} else if (space_info->chunk_alloc) {
		wait_for_alloc = 1;
	} else {
		space_info->chunk_alloc = 1;
J
Josef Bacik 已提交
4508
	}
4509

4510
	spin_unlock(&space_info->lock);
J
Josef Bacik 已提交
4511

4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525
	mutex_lock(&fs_info->chunk_mutex);

	/*
	 * The chunk_mutex is held throughout the entirety of a chunk
	 * allocation, so once we've acquired the chunk_mutex we know that the
	 * other guy is done and we need to recheck and see if we should
	 * allocate.
	 */
	if (wait_for_alloc) {
		mutex_unlock(&fs_info->chunk_mutex);
		wait_for_alloc = 0;
		goto again;
	}

4526 4527
	trans->allocating_chunk = true;

4528 4529 4530 4531 4532 4533 4534
	/*
	 * If we have mixed data/metadata chunks we want to make sure we keep
	 * allocating mixed chunks instead of individual chunks.
	 */
	if (btrfs_mixed_space_info(space_info))
		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);

4535 4536 4537 4538 4539
	/*
	 * if we're doing a data chunk, go ahead and make sure that
	 * we keep a reasonable number of metadata chunks allocated in the
	 * FS as well.
	 */
J
Josef Bacik 已提交
4540
	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
4541 4542 4543 4544
		fs_info->data_chunk_allocations++;
		if (!(fs_info->data_chunk_allocations %
		      fs_info->metadata_ratio))
			force_metadata_allocation(fs_info);
J
Josef Bacik 已提交
4545 4546
	}

4547 4548 4549 4550
	/*
	 * Check if we have enough space in SYSTEM chunk because we may need
	 * to update devices.
	 */
4551
	check_system_chunk(trans, extent_root, flags);
4552

Y
Yan Zheng 已提交
4553
	ret = btrfs_alloc_chunk(trans, extent_root, flags);
4554
	trans->allocating_chunk = false;
4555

J
Josef Bacik 已提交
4556
	spin_lock(&space_info->lock);
4557 4558
	if (ret < 0 && ret != -ENOSPC)
		goto out;
J
Josef Bacik 已提交
4559
	if (ret)
4560
		space_info->full = 1;
4561 4562
	else
		ret = 1;
4563

4564
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
4565
out:
4566
	space_info->chunk_alloc = 0;
J
Josef Bacik 已提交
4567
	spin_unlock(&space_info->lock);
4568
	mutex_unlock(&fs_info->chunk_mutex);
4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582
	/*
	 * When we allocate a new chunk we reserve space in the chunk block
	 * reserve to make sure we can COW nodes/leafs in the chunk tree or
	 * add new nodes/leafs to it if we end up needing to do it when
	 * inserting the chunk item and updating device items as part of the
	 * second phase of chunk allocation, performed by
	 * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a
	 * large number of new block groups to create in our transaction
	 * handle's new_bgs list to avoid exhausting the chunk block reserve
	 * in extreme cases - like having a single transaction create many new
	 * block groups when starting to write out the free space caches of all
	 * the block groups that were made dirty during the lifetime of the
	 * transaction.
	 */
4583
	if (trans->can_flush_pending_bgs &&
4584
	    trans->chunk_bytes_reserved >= (u64)SZ_2M) {
4585
		btrfs_create_pending_block_groups(trans, extent_root);
4586 4587
		btrfs_trans_release_chunk_metadata(trans);
	}
J
Josef Bacik 已提交
4588
	return ret;
4589
}
J
Josef Bacik 已提交
4590

J
Josef Bacik 已提交
4591 4592
static int can_overcommit(struct btrfs_root *root,
			  struct btrfs_space_info *space_info, u64 bytes,
M
Miao Xie 已提交
4593
			  enum btrfs_reserve_flush_enum flush)
J
Josef Bacik 已提交
4594
{
4595 4596
	struct btrfs_block_rsv *global_rsv;
	u64 profile;
4597
	u64 space_size;
J
Josef Bacik 已提交
4598 4599 4600
	u64 avail;
	u64 used;

4601 4602 4603 4604 4605 4606 4607
	/* Don't overcommit when in mixed mode. */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	BUG_ON(root->fs_info == NULL);
	global_rsv = &root->fs_info->global_block_rsv;
	profile = btrfs_get_alloc_profile(root, 0);
J
Josef Bacik 已提交
4608
	used = space_info->bytes_used + space_info->bytes_reserved +
4609 4610 4611 4612 4613 4614 4615 4616
		space_info->bytes_pinned + space_info->bytes_readonly;

	/*
	 * We only want to allow over committing if we have lots of actual space
	 * free, but if we don't have enough space to handle the global reserve
	 * space then we could end up having a real enospc problem when trying
	 * to allocate a chunk or some other such important allocation.
	 */
4617 4618 4619 4620
	spin_lock(&global_rsv->lock);
	space_size = calc_global_rsv_need_space(global_rsv);
	spin_unlock(&global_rsv->lock);
	if (used + space_size >= space_info->total_bytes)
4621 4622 4623
		return 0;

	used += space_info->bytes_may_use;
J
Josef Bacik 已提交
4624 4625 4626 4627 4628 4629 4630

	spin_lock(&root->fs_info->free_chunk_lock);
	avail = root->fs_info->free_chunk_space;
	spin_unlock(&root->fs_info->free_chunk_lock);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
D
David Woodhouse 已提交
4631 4632 4633
	 * space is actually useable.  For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math
J
Josef Bacik 已提交
4634 4635 4636 4637 4638 4639 4640
	 */
	if (profile & (BTRFS_BLOCK_GROUP_DUP |
		       BTRFS_BLOCK_GROUP_RAID1 |
		       BTRFS_BLOCK_GROUP_RAID10))
		avail >>= 1;

	/*
4641 4642 4643
	 * If we aren't flushing all things, let us overcommit up to
	 * 1/2th of the space. If we can flush, don't let us overcommit
	 * too much, let it overcommit up to 1/8 of the space.
J
Josef Bacik 已提交
4644
	 */
M
Miao Xie 已提交
4645
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
4646
		avail >>= 3;
J
Josef Bacik 已提交
4647
	else
4648
		avail >>= 1;
J
Josef Bacik 已提交
4649

4650
	if (used + bytes < space_info->total_bytes + avail)
J
Josef Bacik 已提交
4651 4652 4653 4654
		return 1;
	return 0;
}

4655
static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
4656
					 unsigned long nr_pages, int nr_items)
4657 4658 4659
{
	struct super_block *sb = root->fs_info->sb;

4660 4661 4662 4663
	if (down_read_trylock(&sb->s_umount)) {
		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
		up_read(&sb->s_umount);
	} else {
4664 4665 4666 4667 4668 4669 4670
		/*
		 * We needn't worry the filesystem going from r/w to r/o though
		 * we don't acquire ->s_umount mutex, because the filesystem
		 * should guarantee the delalloc inodes list be empty after
		 * the filesystem is readonly(all dirty pages are written to
		 * the disk).
		 */
4671
		btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
4672
		if (!current->journal_info)
4673 4674
			btrfs_wait_ordered_roots(root->fs_info, nr_items,
						 0, (u64)-1);
4675 4676 4677
	}
}

4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689
static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
{
	u64 bytes;
	int nr;

	bytes = btrfs_calc_trans_metadata_size(root, 1);
	nr = (int)div64_u64(to_reclaim, bytes);
	if (!nr)
		nr = 1;
	return nr;
}

4690
#define EXTENT_SIZE_PER_ITEM	SZ_256K
4691

J
Josef Bacik 已提交
4692
/*
4693
 * shrink metadata reservation for delalloc
J
Josef Bacik 已提交
4694
 */
J
Josef Bacik 已提交
4695 4696
static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
			    bool wait_ordered)
4697
{
4698
	struct btrfs_block_rsv *block_rsv;
J
Josef Bacik 已提交
4699
	struct btrfs_space_info *space_info;
4700
	struct btrfs_trans_handle *trans;
J
Josef Bacik 已提交
4701
	u64 delalloc_bytes;
4702
	u64 max_reclaim;
4703
	long time_left;
4704 4705
	unsigned long nr_pages;
	int loops;
4706
	int items;
M
Miao Xie 已提交
4707
	enum btrfs_reserve_flush_enum flush;
4708

4709
	/* Calc the number of the pages we need flush for space reservation */
4710
	items = calc_reclaim_items_nr(root, to_reclaim);
4711
	to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
4712

4713
	trans = (struct btrfs_trans_handle *)current->journal_info;
4714
	block_rsv = &root->fs_info->delalloc_block_rsv;
J
Josef Bacik 已提交
4715
	space_info = block_rsv->space_info;
4716

4717 4718
	delalloc_bytes = percpu_counter_sum_positive(
						&root->fs_info->delalloc_bytes);
J
Josef Bacik 已提交
4719
	if (delalloc_bytes == 0) {
4720
		if (trans)
J
Josef Bacik 已提交
4721
			return;
4722
		if (wait_ordered)
4723 4724
			btrfs_wait_ordered_roots(root->fs_info, items,
						 0, (u64)-1);
J
Josef Bacik 已提交
4725
		return;
4726 4727
	}

4728
	loops = 0;
J
Josef Bacik 已提交
4729 4730
	while (delalloc_bytes && loops < 3) {
		max_reclaim = min(delalloc_bytes, to_reclaim);
4731
		nr_pages = max_reclaim >> PAGE_SHIFT;
4732
		btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
4733 4734 4735 4736
		/*
		 * We need to wait for the async pages to actually start before
		 * we do anything.
		 */
4737 4738 4739 4740 4741 4742 4743 4744
		max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
		if (!max_reclaim)
			goto skip_async;

		if (max_reclaim <= nr_pages)
			max_reclaim = 0;
		else
			max_reclaim -= nr_pages;
4745

4746 4747 4748 4749
		wait_event(root->fs_info->async_submit_wait,
			   atomic_read(&root->fs_info->async_delalloc_pages) <=
			   (int)max_reclaim);
skip_async:
M
Miao Xie 已提交
4750 4751 4752 4753
		if (!trans)
			flush = BTRFS_RESERVE_FLUSH_ALL;
		else
			flush = BTRFS_RESERVE_NO_FLUSH;
J
Josef Bacik 已提交
4754
		spin_lock(&space_info->lock);
M
Miao Xie 已提交
4755
		if (can_overcommit(root, space_info, orig, flush)) {
J
Josef Bacik 已提交
4756 4757 4758
			spin_unlock(&space_info->lock);
			break;
		}
4759 4760 4761 4762 4763
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
J
Josef Bacik 已提交
4764
		spin_unlock(&space_info->lock);
4765

4766
		loops++;
4767
		if (wait_ordered && !trans) {
4768 4769
			btrfs_wait_ordered_roots(root->fs_info, items,
						 0, (u64)-1);
4770
		} else {
J
Josef Bacik 已提交
4771
			time_left = schedule_timeout_killable(1);
4772 4773 4774
			if (time_left)
				break;
		}
4775 4776
		delalloc_bytes = percpu_counter_sum_positive(
						&root->fs_info->delalloc_bytes);
4777 4778 4779
	}
}

4780 4781 4782 4783 4784
/**
 * maybe_commit_transaction - possibly commit the transaction if its ok to
 * @root - the root we're allocating for
 * @bytes - the number of bytes we want to reserve
 * @force - force the commit
4785
 *
4786 4787 4788
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does.  Otherwise it
 * will return -ENOSPC.
4789
 */
4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804
static int may_commit_transaction(struct btrfs_root *root,
				  struct btrfs_space_info *space_info,
				  u64 bytes, int force)
{
	struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
	struct btrfs_trans_handle *trans;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	if (trans)
		return -EAGAIN;

	if (force)
		goto commit;

	/* See if there is enough pinned space to make this reservation */
4805
	if (percpu_counter_compare(&space_info->total_bytes_pinned,
4806
				   bytes) >= 0)
4807 4808 4809 4810 4811 4812 4813 4814 4815 4816
		goto commit;

	/*
	 * See if there is some space in the delayed insertion reservation for
	 * this reservation.
	 */
	if (space_info != delayed_rsv->space_info)
		return -ENOSPC;

	spin_lock(&delayed_rsv->lock);
4817 4818
	if (percpu_counter_compare(&space_info->total_bytes_pinned,
				   bytes - delayed_rsv->size) >= 0) {
4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831
		spin_unlock(&delayed_rsv->lock);
		return -ENOSPC;
	}
	spin_unlock(&delayed_rsv->lock);

commit:
	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return -ENOSPC;

	return btrfs_commit_transaction(trans, root);
}

4832 4833 4834 4835 4836
struct reserve_ticket {
	u64 bytes;
	int error;
	struct list_head list;
	wait_queue_head_t wait;
4837 4838 4839 4840 4841 4842 4843 4844
};

static int flush_space(struct btrfs_root *root,
		       struct btrfs_space_info *space_info, u64 num_bytes,
		       u64 orig_bytes, int state)
{
	struct btrfs_trans_handle *trans;
	int nr;
J
Josef Bacik 已提交
4845
	int ret = 0;
4846 4847 4848 4849

	switch (state) {
	case FLUSH_DELAYED_ITEMS_NR:
	case FLUSH_DELAYED_ITEMS:
4850 4851 4852
		if (state == FLUSH_DELAYED_ITEMS_NR)
			nr = calc_reclaim_items_nr(root, num_bytes) * 2;
		else
4853
			nr = -1;
4854

4855 4856 4857 4858 4859 4860 4861 4862
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_run_delayed_items_nr(trans, root, nr);
		btrfs_end_transaction(trans, root);
		break;
4863 4864
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
4865
		shrink_delalloc(root, num_bytes * 2, orig_bytes,
4866 4867
				state == FLUSH_DELALLOC_WAIT);
		break;
4868 4869 4870 4871 4872 4873 4874 4875 4876 4877
	case ALLOC_CHUNK:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
				     btrfs_get_alloc_profile(root, 0),
				     CHUNK_ALLOC_NO_FORCE);
		btrfs_end_transaction(trans, root);
4878
		if (ret > 0 || ret == -ENOSPC)
4879 4880
			ret = 0;
		break;
4881 4882 4883 4884 4885 4886 4887 4888
	case COMMIT_TRANS:
		ret = may_commit_transaction(root, space_info, orig_bytes, 0);
		break;
	default:
		ret = -ENOSPC;
		break;
	}

4889 4890
	trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes,
				orig_bytes, state, ret);
4891 4892
	return ret;
}
4893 4894 4895 4896 4897

static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
				 struct btrfs_space_info *space_info)
{
4898
	struct reserve_ticket *ticket;
4899 4900
	u64 used;
	u64 expected;
4901
	u64 to_reclaim = 0;
4902

4903 4904 4905 4906 4907 4908
	list_for_each_entry(ticket, &space_info->tickets, list)
		to_reclaim += ticket->bytes;
	list_for_each_entry(ticket, &space_info->priority_tickets, list)
		to_reclaim += ticket->bytes;
	if (to_reclaim)
		return to_reclaim;
4909

4910 4911 4912 4913 4914
	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
	if (can_overcommit(root, space_info, to_reclaim,
			   BTRFS_RESERVE_FLUSH_ALL))
		return 0;

4915 4916 4917
	used = space_info->bytes_used + space_info->bytes_reserved +
	       space_info->bytes_pinned + space_info->bytes_readonly +
	       space_info->bytes_may_use;
4918
	if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932
		expected = div_factor_fine(space_info->total_bytes, 95);
	else
		expected = div_factor_fine(space_info->total_bytes, 90);

	if (used > expected)
		to_reclaim = used - expected;
	else
		to_reclaim = 0;
	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				     space_info->bytes_reserved);
	return to_reclaim;
}

static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
4933
					struct btrfs_root *root, u64 used)
4934
{
4935 4936 4937
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);

	/* If we're just plain full then async reclaim just slows us down. */
4938
	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
4939 4940
		return 0;

4941
	if (!btrfs_calc_reclaim_metadata_size(root, space_info))
4942 4943
		return 0;

4944 4945 4946
	return (used >= thresh && !btrfs_fs_closing(root->fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING,
			  &root->fs_info->fs_state));
4947 4948
}

4949
static void wake_all_tickets(struct list_head *head)
4950
{
4951
	struct reserve_ticket *ticket;
4952

4953 4954 4955 4956 4957
	while (!list_empty(head)) {
		ticket = list_first_entry(head, struct reserve_ticket, list);
		list_del_init(&ticket->list);
		ticket->error = -ENOSPC;
		wake_up(&ticket->wait);
4958 4959 4960
	}
}

4961 4962 4963 4964 4965
/*
 * This is for normal flushers, we can wait all goddamned day if we want to.  We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
4966 4967 4968 4969 4970 4971
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
4972
	int commit_cycles = 0;
4973
	u64 last_tickets_id;
4974 4975 4976 4977

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

4978
	spin_lock(&space_info->lock);
4979 4980
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
						      space_info);
4981 4982 4983
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
4984
		return;
4985
	}
4986
	last_tickets_id = space_info->tickets_id;
4987
	spin_unlock(&space_info->lock);
4988 4989

	flush_state = FLUSH_DELAYED_ITEMS_NR;
4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005
	do {
		struct reserve_ticket *ticket;
		int ret;

		ret = flush_space(fs_info->fs_root, space_info, to_reclaim,
			    to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
							      space_info);
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);
5006
		if (last_tickets_id == space_info->tickets_id) {
5007 5008
			flush_state++;
		} else {
5009
			last_tickets_id = space_info->tickets_id;
5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				wake_all_tickets(&space_info->tickets);
				space_info->flush = 0;
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
					    struct btrfs_space_info *space_info,
					    struct reserve_ticket *ticket)
{
	u64 to_reclaim;
	int flush_state = FLUSH_DELAYED_ITEMS_NR;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
						      space_info);
	if (!to_reclaim) {
		spin_unlock(&space_info->lock);
		return;
	}
	spin_unlock(&space_info->lock);

5049 5050 5051 5052
	do {
		flush_space(fs_info->fs_root, space_info, to_reclaim,
			    to_reclaim, flush_state);
		flush_state++;
5053 5054 5055
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
5056
			return;
5057 5058 5059 5060 5061 5062 5063 5064 5065 5066
		}
		spin_unlock(&space_info->lock);

		/*
		 * Priority flushers can't wait on delalloc without
		 * deadlocking.
		 */
		if (flush_state == FLUSH_DELALLOC ||
		    flush_state == FLUSH_DELALLOC_WAIT)
			flush_state = ALLOC_CHUNK;
5067
	} while (flush_state < COMMIT_TRANS);
5068 5069
}

5070 5071 5072 5073
static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
			       struct btrfs_space_info *space_info,
			       struct reserve_ticket *ticket, u64 orig_bytes)

5074
{
5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104
	DEFINE_WAIT(wait);
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			ret = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	if (!ret)
		ret = ticket->error;
	if (!list_empty(&ticket->list))
		list_del_init(&ticket->list);
	if (ticket->bytes && ticket->bytes < orig_bytes) {
		u64 num_bytes = orig_bytes - ticket->bytes;
		space_info->bytes_may_use -= num_bytes;
		trace_btrfs_space_reservation(fs_info, "space_info",
					      space_info->flags, num_bytes, 0);
	}
	spin_unlock(&space_info->lock);

	return ret;
5105 5106
}

5107 5108 5109
/**
 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
5110
 * @space_info - the space info we want to allocate from
5111
 * @orig_bytes - the number of bytes we want
5112
 * @flush - whether or not we can flush to make our reservation
5113
 *
5114
 * This will reserve orig_bytes number of bytes from the space info associated
5115 5116 5117 5118 5119
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is 0 then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
5120
 */
5121 5122 5123 5124
static int __reserve_metadata_bytes(struct btrfs_root *root,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush)
J
Josef Bacik 已提交
5125
{
	struct reserve_ticket ticket;
	u64 used;
	int ret = 0;

	ASSERT(orig_bytes);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = space_info->bytes_used + space_info->bytes_reserved +
		space_info->bytes_pinned + space_info->bytes_readonly +
		space_info->bytes_may_use;

	/*
	 * If we have enough space then hooray, make our reservation and carry
	 * on.  If not see if we can overcommit, and if we can, hooray carry on.
	 * If not things get more complicated.
	 */
	if (used + orig_bytes <= space_info->total_bytes) {
		space_info->bytes_may_use += orig_bytes;
		trace_btrfs_space_reservation(root->fs_info, "space_info",
					      space_info->flags, orig_bytes,
					      1);
		ret = 0;
	} else if (can_overcommit(root, space_info, orig_bytes, flush)) {
		space_info->bytes_may_use += orig_bytes;
		trace_btrfs_space_reservation(root->fs_info, "space_info",
					      space_info->flags, orig_bytes,
					      1);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		init_waitqueue_head(&ticket.wait);
		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(root->fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq,
					   &root->fs_info->async_reclaim_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!root->fs_info->log_root_recovering &&
		    need_do_async_reclaim(space_info, root, used) &&
		    !work_busy(&root->fs_info->async_reclaim_work)) {
			trace_btrfs_trigger_flush(root->fs_info,
						  space_info->flags,
						  orig_bytes, flush,
						  "preempt");
			queue_work(system_unbound_wq,
				   &root->fs_info->async_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);

	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		return wait_reserve_ticket(root->fs_info, space_info, &ticket,
					   orig_bytes);

	ret = 0;
	priority_reclaim_metadata_space(root->fs_info, space_info, &ticket);
	spin_lock(&space_info->lock);
	if (ticket.bytes) {
		if (ticket.bytes < orig_bytes) {
			u64 num_bytes = orig_bytes - ticket.bytes;
			space_info->bytes_may_use -= num_bytes;
			trace_btrfs_space_reservation(root->fs_info,
					"space_info", space_info->flags,
					num_bytes, 0);

		}
		list_del_init(&ticket.list);
		ret = -ENOSPC;
	}
	spin_unlock(&space_info->lock);
	ASSERT(list_empty(&ticket.list));
	return ret;
}

/**
 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If flush is 0 then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
static int reserve_metadata_bytes(struct btrfs_root *root,
				  struct btrfs_block_rsv *block_rsv,
				  u64 orig_bytes,
				  enum btrfs_reserve_flush_enum flush)
{
	int ret;

	ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes,
				       flush);
	if (ret == -ENOSPC &&
	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
		struct btrfs_block_rsv *global_rsv =
			&root->fs_info->global_block_rsv;

		if (block_rsv != global_rsv &&
		    !block_rsv_use_bytes(global_rsv, orig_bytes))
			ret = 0;
	}
	if (ret == -ENOSPC)
		trace_btrfs_space_reservation(root->fs_info,
					      "space_info:enospc",
					      block_rsv->space_info->flags,
					      orig_bytes, 1);
	return ret;
}

static struct btrfs_block_rsv *get_block_rsv(
					const struct btrfs_trans_handle *trans,
					const struct btrfs_root *root)
{
	struct btrfs_block_rsv *block_rsv = NULL;

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
	    (root == root->fs_info->csum_root && trans->adding_csums) ||
	     (root == root->fs_info->uuid_root))
		block_rsv = trans->block_rsv;

	if (!block_rsv)
		block_rsv = root->block_rsv;

	if (!block_rsv)
		block_rsv = &root->fs_info->empty_block_rsv;

	return block_rsv;
}

static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes)
{
	int ret = -ENOSPC;
	spin_lock(&block_rsv->lock);
	if (block_rsv->reserved >= num_bytes) {
		block_rsv->reserved -= num_bytes;
		if (block_rsv->reserved < block_rsv->size)
			block_rsv->full = 0;
		ret = 0;
	}
	spin_unlock(&block_rsv->lock);
	return ret;
}

static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
				u64 num_bytes, int update_size)
{
	spin_lock(&block_rsv->lock);
	block_rsv->reserved += num_bytes;
	if (update_size)
		block_rsv->size += num_bytes;
	else if (block_rsv->reserved >= block_rsv->size)
		block_rsv->full = 1;
	spin_unlock(&block_rsv->lock);
}

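/*
 * Move num_bytes from the global block rsv into dest, but only if the two
 * share a space_info and the global rsv would still hold at least
 * min_factor of its size afterwards; otherwise return -ENOSPC.
 */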
int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
			     struct btrfs_block_rsv *dest, u64 num_bytes,
			     int min_factor)
{
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	u64 min_bytes;

	if (global_rsv->space_info != dest->space_info)
		return -ENOSPC;

	spin_lock(&global_rsv->lock);
	min_bytes = div_factor(global_rsv->size, min_factor);
	if (global_rsv->reserved < min_bytes + num_bytes) {
		spin_unlock(&global_rsv->lock);
		return -ENOSPC;
	}
	global_rsv->reserved -= num_bytes;
	if (global_rsv->reserved < global_rsv->size)
		global_rsv->full = 0;
	spin_unlock(&global_rsv->lock);

	block_rsv_add_bytes(dest, num_bytes, 1);
	return 0;
}

/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
 */
static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes)
{
	struct reserve_ticket *ticket;
	struct list_head *head;
	u64 used;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
	bool check_overcommit = false;

	spin_lock(&space_info->lock);
	head = &space_info->priority_tickets;

	/*
	 * If we are over our limit then we need to check and see if we can
	 * overcommit, and if we can't then we just need to free up our space
	 * and not satisfy any requests.
	 */
	used = space_info->bytes_used + space_info->bytes_reserved +
		space_info->bytes_pinned + space_info->bytes_readonly +
		space_info->bytes_may_use;
	if (used - num_bytes >= space_info->total_bytes)
		check_overcommit = true;
again:
	while (!list_empty(head) && num_bytes) {
		ticket = list_first_entry(head, struct reserve_ticket,
					  list);
		/*
		 * We use 0 bytes because this space is already reserved, so
		 * adding the ticket space would be a double count.
		 */
		if (check_overcommit &&
		    !can_overcommit(fs_info->extent_root, space_info, 0,
				    flush))
			break;
		if (num_bytes >= ticket->bytes) {
			list_del_init(&ticket->list);
			num_bytes -= ticket->bytes;
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			ticket->bytes -= num_bytes;
			num_bytes = 0;
		}
	}

	if (num_bytes && head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
	space_info->bytes_may_use -= num_bytes;
	trace_btrfs_space_reservation(fs_info, "space_info",
				      space_info->flags, num_bytes, 0);
	spin_unlock(&space_info->lock);
}

/*
 * This is for newly allocated space that isn't accounted in
 * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
 * we use this helper.
 */
static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				     struct btrfs_space_info *space_info,
				     u64 num_bytes)
{
	struct reserve_ticket *ticket;
	struct list_head *head = &space_info->priority_tickets;

again:
	while (!list_empty(head) && num_bytes) {
		ticket = list_first_entry(head, struct reserve_ticket,
					  list);
		if (num_bytes >= ticket->bytes) {
			trace_btrfs_space_reservation(fs_info, "space_info",
						      space_info->flags,
						      ticket->bytes, 1);
			list_del_init(&ticket->list);
			num_bytes -= ticket->bytes;
			space_info->bytes_may_use += ticket->bytes;
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			trace_btrfs_space_reservation(fs_info, "space_info",
						      space_info->flags,
						      num_bytes, 1);
			space_info->bytes_may_use += num_bytes;
			ticket->bytes -= num_bytes;
			num_bytes = 0;
		}
	}

	if (num_bytes && head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		goto again;
	}
}

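/*
 * Shrink block_rsv by num_bytes ((u64)-1 releases everything).  Any excess
 * reserved space is first used to top up dest (if given) and the remainder
 * is returned to the space_info, where it can satisfy queued tickets.
 */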
static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_block_rsv *block_rsv,
				    struct btrfs_block_rsv *dest, u64 num_bytes)
{
	struct btrfs_space_info *space_info = block_rsv->space_info;

	spin_lock(&block_rsv->lock);
	if (num_bytes == (u64)-1)
		num_bytes = block_rsv->size;
	block_rsv->size -= num_bytes;
	if (block_rsv->reserved >= block_rsv->size) {
		num_bytes = block_rsv->reserved - block_rsv->size;
		block_rsv->reserved = block_rsv->size;
		block_rsv->full = 1;
	} else {
		num_bytes = 0;
	}
	spin_unlock(&block_rsv->lock);

	if (num_bytes > 0) {
		if (dest) {
			spin_lock(&dest->lock);
			if (!dest->full) {
				u64 bytes_to_add;

				bytes_to_add = dest->size - dest->reserved;
				bytes_to_add = min(num_bytes, bytes_to_add);
				dest->reserved += bytes_to_add;
				if (dest->reserved >= dest->size)
					dest->full = 1;
				num_bytes -= bytes_to_add;
			}
			spin_unlock(&dest->lock);
		}
		if (num_bytes)
			space_info_add_old_bytes(fs_info, space_info,
						 num_bytes);
	}
}

int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
			    struct btrfs_block_rsv *dst, u64 num_bytes,
			    int update_size)
{
	int ret;

	ret = block_rsv_use_bytes(src, num_bytes);
	if (ret)
		return ret;

	block_rsv_add_bytes(dst, num_bytes, update_size);
	return 0;
}

void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
{
	memset(rsv, 0, sizeof(*rsv));
	spin_lock_init(&rsv->lock);
	rsv->type = type;
}

struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
					      unsigned short type)
{
	struct btrfs_block_rsv *block_rsv;
	struct btrfs_fs_info *fs_info = root->fs_info;

	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
	if (!block_rsv)
		return NULL;

	btrfs_init_block_rsv(block_rsv, type);
	block_rsv->space_info = __find_space_info(fs_info,
						  BTRFS_BLOCK_GROUP_METADATA);
	return block_rsv;
}

void btrfs_free_block_rsv(struct btrfs_root *root,
			  struct btrfs_block_rsv *rsv)
{
	if (!rsv)
		return;
	btrfs_block_rsv_release(root, rsv, (u64)-1);
	kfree(rsv);
}

void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv)
{
	kfree(rsv);
}

int btrfs_block_rsv_add(struct btrfs_root *root,
			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
			enum btrfs_reserve_flush_enum flush)
{
	int ret;

	if (num_bytes == 0)
		return 0;

	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
	if (!ret) {
		block_rsv_add_bytes(block_rsv, num_bytes, 1);
		return 0;
	}
	return ret;
}
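
/*
 * Illustrative pairing of btrfs_block_rsv_add() and btrfs_block_rsv_release()
 * (a sketch, not lifted from a particular caller):
 *
 *	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
 *				  BTRFS_RESERVE_FLUSH_ALL);
 *	if (ret)
 *		return ret;
 *	...
 *	btrfs_block_rsv_release(root, rsv, num_bytes);
 */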

int btrfs_block_rsv_check(struct btrfs_root *root,
			  struct btrfs_block_rsv *block_rsv, int min_factor)
{
	u64 num_bytes = 0;
	int ret = -ENOSPC;

	if (!block_rsv)
		return 0;

	spin_lock(&block_rsv->lock);
	num_bytes = div_factor(block_rsv->size, min_factor);
	if (block_rsv->reserved >= num_bytes)
		ret = 0;
	spin_unlock(&block_rsv->lock);

	return ret;
}

int btrfs_block_rsv_refill(struct btrfs_root *root,
			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
			   enum btrfs_reserve_flush_enum flush)
{
	u64 num_bytes = 0;
	int ret = -ENOSPC;

	if (!block_rsv)
		return 0;

	spin_lock(&block_rsv->lock);
	num_bytes = min_reserved;
	if (block_rsv->reserved >= num_bytes)
		ret = 0;
	else
		num_bytes -= block_rsv->reserved;
	spin_unlock(&block_rsv->lock);

	if (!ret)
		return 0;

	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
	if (!ret) {
		block_rsv_add_bytes(block_rsv, num_bytes, 0);
		return 0;
	}

	return ret;
}

void btrfs_block_rsv_release(struct btrfs_root *root,
			     struct btrfs_block_rsv *block_rsv,
			     u64 num_bytes)
{
	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
	if (global_rsv == block_rsv ||
	    block_rsv->space_info != global_rsv->space_info)
		global_rsv = NULL;
	block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
				num_bytes);
}

static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
	struct btrfs_space_info *sinfo = block_rsv->space_info;
	u64 num_bytes;

	/*
	 * The global block rsv is based on the size of the extent tree, the
	 * checksum tree and the root tree.  If the fs is empty we want to set
	 * it to a minimal amount for safety.
	 */
	num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
		btrfs_root_used(&fs_info->csum_root->root_item) +
		btrfs_root_used(&fs_info->tree_root->root_item);
	num_bytes = max_t(u64, num_bytes, SZ_16M);

	spin_lock(&sinfo->lock);
	spin_lock(&block_rsv->lock);

	block_rsv->size = min_t(u64, num_bytes, SZ_512M);

	if (block_rsv->reserved < block_rsv->size) {
		num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
			sinfo->bytes_reserved + sinfo->bytes_readonly +
			sinfo->bytes_may_use;
		if (sinfo->total_bytes > num_bytes) {
			num_bytes = sinfo->total_bytes - num_bytes;
			num_bytes = min(num_bytes,
					block_rsv->size - block_rsv->reserved);
			block_rsv->reserved += num_bytes;
			sinfo->bytes_may_use += num_bytes;
			trace_btrfs_space_reservation(fs_info, "space_info",
						      sinfo->flags, num_bytes,
						      1);
		}
	} else if (block_rsv->reserved > block_rsv->size) {
		num_bytes = block_rsv->reserved - block_rsv->size;
		sinfo->bytes_may_use -= num_bytes;
		trace_btrfs_space_reservation(fs_info, "space_info",
				      sinfo->flags, num_bytes, 0);
		block_rsv->reserved = block_rsv->size;
	}

	if (block_rsv->reserved == block_rsv->size)
		block_rsv->full = 1;
	else
		block_rsv->full = 0;

	spin_unlock(&block_rsv->lock);
	spin_unlock(&sinfo->lock);
}

static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
{
	struct btrfs_space_info *space_info;

	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
	fs_info->chunk_block_rsv.space_info = space_info;

	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
	fs_info->global_block_rsv.space_info = space_info;
	fs_info->delalloc_block_rsv.space_info = space_info;
	fs_info->trans_block_rsv.space_info = space_info;
	fs_info->empty_block_rsv.space_info = space_info;
	fs_info->delayed_block_rsv.space_info = space_info;

	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
	if (fs_info->quota_root)
		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;

	update_global_block_rsv(fs_info);
}

static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
{
	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
				(u64)-1);
	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
	WARN_ON(fs_info->trans_block_rsv.size > 0);
	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
	WARN_ON(fs_info->chunk_block_rsv.size > 0);
	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
	WARN_ON(fs_info->delayed_block_rsv.size > 0);
	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
}

void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root)
{
	if (!trans->block_rsv)
		return;

	if (!trans->bytes_reserved)
		return;

	trace_btrfs_space_reservation(root->fs_info, "transaction",
				      trans->transid, trans->bytes_reserved, 0);
	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
	trans->bytes_reserved = 0;
}

/*
 * To be called after all the new block groups attached to the transaction
 * handle have been created (btrfs_create_pending_block_groups()).
 */
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;

	if (!trans->chunk_bytes_reserved)
		return;

	WARN_ON_ONCE(!list_empty(&trans->new_bgs));

	block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
				trans->chunk_bytes_reserved);
	trans->chunk_bytes_reserved = 0;
}

/* Can only return 0 or -ENOSPC */
int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
				  struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	/*
	 * We always use trans->block_rsv here as we will have reserved space
	 * for our orphan when starting the transaction, using get_block_rsv()
	 * here will sometimes make us choose the wrong block rsv as we could be
	 * doing a reloc inode for a non refcounted root.
	 */
	struct btrfs_block_rsv *src_rsv = trans->block_rsv;
	struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;

	/*
	 * We need to hold space in order to delete our orphan item once we've
	 * added it, so this takes the reservation so we can release it later
	 * when we are truly done with the orphan item.
	 */
	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
	trace_btrfs_space_reservation(root->fs_info, "orphan",
				      btrfs_ino(inode), num_bytes, 1);
	return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
}

void btrfs_orphan_release_metadata(struct inode *inode)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
	trace_btrfs_space_reservation(root->fs_info, "orphan",
				      btrfs_ino(inode), num_bytes, 0);
	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
}

/*
 * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
 * root: the root of the parent directory
 * rsv: block reservation
 * items: the number of items that we need to reserve for
 * qgroup_reserved: used to return the reserved size in qgroup
 *
 * This function is used to reserve the space for snapshot/subvolume
 * creation and deletion. Those operations are different from the
 * common file/directory operations, since they change two fs/file trees
 * and the root tree; the number of items that the qgroup reserves is
 * different from the free space reservation. So we cannot use
 * the space reservation mechanism in start_transaction().
 */
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
				     struct btrfs_block_rsv *rsv,
				     int items,
				     u64 *qgroup_reserved,
				     bool use_global_rsv)
{
	u64 num_bytes;
	int ret;
	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;

	if (root->fs_info->quota_enabled) {
		/* One for parent inode, two for dir entries */
		num_bytes = 3 * root->nodesize;
		ret = btrfs_qgroup_reserve_meta(root, num_bytes);
		if (ret)
			return ret;
	} else {
		num_bytes = 0;
	}

	*qgroup_reserved = num_bytes;

	num_bytes = btrfs_calc_trans_metadata_size(root, items);
	rsv->space_info = __find_space_info(root->fs_info,
					    BTRFS_BLOCK_GROUP_METADATA);
	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
				  BTRFS_RESERVE_FLUSH_ALL);

	if (ret == -ENOSPC && use_global_rsv)
		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);

	if (ret && *qgroup_reserved)
		btrfs_qgroup_free_meta(root, *qgroup_reserved);

	return ret;
}

void btrfs_subvolume_release_metadata(struct btrfs_root *root,
				      struct btrfs_block_rsv *rsv,
				      u64 qgroup_reserved)
{
	btrfs_block_rsv_release(root, rsv, (u64)-1);
}

/**
 * drop_outstanding_extent - drop an outstanding extent
 * @inode: the inode we're dropping the extent for
 * @num_bytes: the number of bytes we're releasing.
 *
 * This is called when we are freeing up an outstanding extent, either
 * after an error or after an extent is written.  This will return the number of
 * reserved extents that need to be freed.  This must be called with
 * BTRFS_I(inode)->lock held.
 */
static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
{
	unsigned drop_inode_space = 0;
	unsigned dropped_extents = 0;
	unsigned num_extents = 0;

	num_extents = (unsigned)div64_u64(num_bytes +
					  BTRFS_MAX_EXTENT_SIZE - 1,
					  BTRFS_MAX_EXTENT_SIZE);
	ASSERT(num_extents);
	ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
	BTRFS_I(inode)->outstanding_extents -= num_extents;

	if (BTRFS_I(inode)->outstanding_extents == 0 &&
	    test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
			       &BTRFS_I(inode)->runtime_flags))
		drop_inode_space = 1;

	/*
	 * If we have at least as many outstanding extents as we have reserved
	 * then we need to leave the reserved extents count alone.
	 */
	if (BTRFS_I(inode)->outstanding_extents >=
	    BTRFS_I(inode)->reserved_extents)
		return drop_inode_space;

	dropped_extents = BTRFS_I(inode)->reserved_extents -
		BTRFS_I(inode)->outstanding_extents;
	BTRFS_I(inode)->reserved_extents -= dropped_extents;
	return dropped_extents + drop_inode_space;
}

/**
 * calc_csum_metadata_size - return the amount of metadata space that must be
 *	reserved/freed for the given bytes.
 * @inode: the inode we're manipulating
 * @num_bytes: the number of bytes in question
 * @reserve: 1 if we are reserving space, 0 if we are freeing space
 *
 * This adjusts the number of csum_bytes in the inode and then returns the
 * correct amount of metadata that must either be reserved or freed.  We
 * calculate how many checksums we can fit into one leaf and then divide the
 * number of bytes that will need to be checksummed by this value to figure out
 * how many checksums will be required.  If we are adding bytes then the number
 * may go up and we will return the number of additional bytes that must be
 * reserved.  If it is going down we will return the number of bytes that must
 * be freed.
 *
 * This must be called with BTRFS_I(inode)->lock held.
 */
static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
				   int reserve)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 old_csums, num_csums;

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
	    BTRFS_I(inode)->csum_bytes == 0)
		return 0;

	old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
	if (reserve)
		BTRFS_I(inode)->csum_bytes += num_bytes;
	else
		BTRFS_I(inode)->csum_bytes -= num_bytes;
	num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);

	/* No change, no need to reserve more */
	if (old_csums == num_csums)
		return 0;

	if (reserve)
		return btrfs_calc_trans_metadata_size(root,
						      num_csums - old_csums);

	return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
}

int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
	u64 to_reserve = 0;
	u64 csum_bytes;
	unsigned nr_extents = 0;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
	int ret = 0;
	bool delalloc_lock = true;
	u64 to_free = 0;
	unsigned dropped;
	bool release_extra = false;

	/* If we are a free space inode we need to not flush since we will be in
	 * the middle of a transaction commit.  We also don't need the delalloc
	 * mutex since we won't race with anybody.  We need this mostly to make
	 * lockdep shut its filthy mouth.
	 *
	 * If we have a transaction open (can happen if we call truncate_block
	 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
	 */
	if (btrfs_is_free_space_inode(inode)) {
		flush = BTRFS_RESERVE_NO_FLUSH;
		delalloc_lock = false;
	} else if (current->journal_info) {
		flush = BTRFS_RESERVE_FLUSH_LIMIT;
	}

	if (flush != BTRFS_RESERVE_NO_FLUSH &&
	    btrfs_transaction_in_commit(root->fs_info))
		schedule_timeout(1);

	if (delalloc_lock)
		mutex_lock(&BTRFS_I(inode)->delalloc_mutex);

	num_bytes = ALIGN(num_bytes, root->sectorsize);

	spin_lock(&BTRFS_I(inode)->lock);
	nr_extents = (unsigned)div64_u64(num_bytes +
					 BTRFS_MAX_EXTENT_SIZE - 1,
					 BTRFS_MAX_EXTENT_SIZE);
	BTRFS_I(inode)->outstanding_extents += nr_extents;

	nr_extents = 0;
	if (BTRFS_I(inode)->outstanding_extents >
	    BTRFS_I(inode)->reserved_extents)
		nr_extents += BTRFS_I(inode)->outstanding_extents -
			BTRFS_I(inode)->reserved_extents;

	/* We always want to reserve a slot for updating the inode. */
	to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents + 1);
	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
	csum_bytes = BTRFS_I(inode)->csum_bytes;
	spin_unlock(&BTRFS_I(inode)->lock);

	if (root->fs_info->quota_enabled) {
		ret = btrfs_qgroup_reserve_meta(root,
				nr_extents * root->nodesize);
		if (ret)
			goto out_fail;
	}

	ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
	if (unlikely(ret)) {
		btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
		goto out_fail;
	}

	spin_lock(&BTRFS_I(inode)->lock);
	if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
			     &BTRFS_I(inode)->runtime_flags)) {
		to_reserve -= btrfs_calc_trans_metadata_size(root, 1);
		release_extra = true;
	}
	BTRFS_I(inode)->reserved_extents += nr_extents;
	spin_unlock(&BTRFS_I(inode)->lock);

	if (delalloc_lock)
		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);

	if (to_reserve)
		trace_btrfs_space_reservation(root->fs_info, "delalloc",
					      btrfs_ino(inode), to_reserve, 1);
	if (release_extra)
		btrfs_block_rsv_release(root, block_rsv,
					btrfs_calc_trans_metadata_size(root,
								       1));
	return 0;

out_fail:
	spin_lock(&BTRFS_I(inode)->lock);
	dropped = drop_outstanding_extent(inode, num_bytes);
	/*
	 * If the inode's csum_bytes is the same as the original
	 * csum_bytes then we know we haven't raced with any free()ers
	 * so we can just reduce our inode's csum bytes and carry on.
	 */
	if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
		calc_csum_metadata_size(inode, num_bytes, 0);
	} else {
		u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
		u64 bytes;

		/*
		 * This is tricky, but first we need to figure out how much we
		 * freed from any free()ers that occurred during this
		 * reservation, so we reset ->csum_bytes to the csum_bytes
		 * before we dropped our lock, and then call the free for the
		 * number of bytes that were freed while we were trying our
		 * reservation.
		 */
		bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
		BTRFS_I(inode)->csum_bytes = csum_bytes;
		to_free = calc_csum_metadata_size(inode, bytes, 0);


		/*
		 * Now we need to see how much we would have freed had we not
		 * been making this reservation and our ->csum_bytes were not
		 * artificially inflated.
		 */
		BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
		bytes = csum_bytes - orig_csum_bytes;
		bytes = calc_csum_metadata_size(inode, bytes, 0);

		/*
		 * Now reset ->csum_bytes to what it should be.  If bytes is
		 * more than to_free then we would have freed more space had we
		 * not had an artificially high ->csum_bytes, so we need to free
		 * the remainder.  If bytes is the same or less then we don't
		 * need to do anything, the other free-ers did the correct
		 * thing.
		 */
		BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
		if (bytes > to_free)
			to_free = bytes - to_free;
		else
			to_free = 0;
	}
	spin_unlock(&BTRFS_I(inode)->lock);
	if (dropped)
		to_free += btrfs_calc_trans_metadata_size(root, dropped);

	if (to_free) {
		btrfs_block_rsv_release(root, block_rsv, to_free);
		trace_btrfs_space_reservation(root->fs_info, "delalloc",
					      btrfs_ino(inode), to_free, 0);
	}
	if (delalloc_lock)
		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
	return ret;
}

/**
 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
 * @inode: the inode to release the reservation for
 * @num_bytes: the number of bytes we're releasing
 *
 * This will release the metadata reservation for an inode.  This can be called
 * once we complete IO for a given set of bytes to release their metadata
 * reservations.
 */
void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 to_free = 0;
	unsigned dropped;

	num_bytes = ALIGN(num_bytes, root->sectorsize);
	spin_lock(&BTRFS_I(inode)->lock);
	dropped = drop_outstanding_extent(inode, num_bytes);

	if (num_bytes)
		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
	spin_unlock(&BTRFS_I(inode)->lock);
	if (dropped > 0)
		to_free += btrfs_calc_trans_metadata_size(root, dropped);

	if (btrfs_is_testing(root->fs_info))
		return;

	trace_btrfs_space_reservation(root->fs_info, "delalloc",
				      btrfs_ino(inode), to_free, 0);

	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
				to_free);
}

/**
 * btrfs_delalloc_reserve_space - reserve data and metadata space for
 * delalloc
 * @inode: inode we're writing to
 * @start: start of the range we are writing to
 * @len: length of the range we are writing to
 *
 * TODO: This function will finally replace old btrfs_delalloc_reserve_space()
 *
 * This will do the following things
 *
 * o reserve space in data space info for num bytes
 *   and reserve precious corresponding qgroup space
 *   (Done in check_data_free_space)
 *
 * o reserve space for metadata space, based on the number of outstanding
 *   extents and how much csums will be needed
 *   also reserve metadata space in a per root over-reserve method.
 * o add to the inodes->delalloc_bytes
 * o add it to the fs_info's delalloc inodes list.
 *   (Above 3 all done in delalloc_reserve_metadata)
 *
 * Return 0 for success
 * Return <0 for error (-ENOSPC or -EDQUOT)
 */
int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
{
	int ret;

	ret = btrfs_check_data_free_space(inode, start, len);
	if (ret < 0)
		return ret;
	ret = btrfs_delalloc_reserve_metadata(inode, len);
	if (ret < 0)
		btrfs_free_reserved_data_space(inode, start, len);
	return ret;
}
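
/*
 * Illustrative pairing of btrfs_delalloc_reserve_space() and
 * btrfs_delalloc_release_space() (a sketch, not a specific caller): reserve
 * data + metadata before dirtying a range and give both back if the write
 * is aborted:
 *
 *	ret = btrfs_delalloc_reserve_space(inode, start, len);
 *	if (ret)
 *		return ret;
 *	...
 *	if (failed)
 *		btrfs_delalloc_release_space(inode, start, len);
 */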

/**
 * btrfs_delalloc_release_space - release data and metadata space for delalloc
 * @inode: inode we're releasing space for
 * @start: start position of the space already reserved
 * @len: the len of the space already reserved
 *
 * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
 * called in the case that we don't need the metadata AND data reservations
 * anymore.  So if there is an error or we insert an inline extent.
 *
 * This function will release the metadata space that was not used and will
 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
 * list if there are no delalloc bytes left.
 * Also it will handle the qgroup reserved space.
 */
void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
{
	btrfs_delalloc_release_metadata(inode, len);
	btrfs_free_reserved_data_space(inode, start, len);
}

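/*
 * Update the used-bytes accounting when num_bytes at bytenr are allocated
 * (alloc != 0) or freed: adjust the superblock total, the block group items
 * and their space_info counters, mark the affected block groups dirty in
 * the current transaction, and queue now-empty groups for removal.
 */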
static int update_block_group(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 bytenr,
			      u64 num_bytes, int alloc)
{
	struct btrfs_block_group_cache *cache = NULL;
	struct btrfs_fs_info *info = root->fs_info;
	u64 total = num_bytes;
	u64 old_val;
	u64 byte_in_group;
	int factor;

	/* block accounting for super block */
	spin_lock(&info->delalloc_root_lock);
	old_val = btrfs_super_bytes_used(info->super_copy);
	if (alloc)
		old_val += num_bytes;
	else
		old_val -= num_bytes;
	btrfs_set_super_bytes_used(info->super_copy, old_val);
	spin_unlock(&info->delalloc_root_lock);

	while (total) {
		cache = btrfs_lookup_block_group(info, bytenr);
		if (!cache)
			return -ENOENT;
		if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
				    BTRFS_BLOCK_GROUP_RAID1 |
				    BTRFS_BLOCK_GROUP_RAID10))
			factor = 2;
		else
			factor = 1;
		/*
		 * If this block group has free space cache written out, we
		 * need to make sure to load it if we are removing space.  This
		 * is because we need the unpinning stage to actually add the
		 * space back to the block group, otherwise we will leak space.
		 */
		if (!alloc && cache->cached == BTRFS_CACHE_NO)
			cache_block_group(cache, 1);

		byte_in_group = bytenr - cache->key.objectid;
		WARN_ON(byte_in_group > cache->key.offset);

		spin_lock(&cache->space_info->lock);
		spin_lock(&cache->lock);

		if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
		    cache->disk_cache_state < BTRFS_DC_CLEAR)
			cache->disk_cache_state = BTRFS_DC_CLEAR;

		old_val = btrfs_block_group_used(&cache->item);
		num_bytes = min(total, cache->key.offset - byte_in_group);
		if (alloc) {
			old_val += num_bytes;
			btrfs_set_block_group_used(&cache->item, old_val);
			cache->reserved -= num_bytes;
			cache->space_info->bytes_reserved -= num_bytes;
			cache->space_info->bytes_used += num_bytes;
			cache->space_info->disk_used += num_bytes * factor;
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);
		} else {
			old_val -= num_bytes;
			btrfs_set_block_group_used(&cache->item, old_val);
			cache->pinned += num_bytes;
			cache->space_info->bytes_pinned += num_bytes;
			cache->space_info->bytes_used -= num_bytes;
			cache->space_info->disk_used -= num_bytes * factor;
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);

			trace_btrfs_space_reservation(root->fs_info, "pinned",
						      cache->space_info->flags,
						      num_bytes, 1);
			set_extent_dirty(info->pinned_extents,
					 bytenr, bytenr + num_bytes - 1,
					 GFP_NOFS | __GFP_NOFAIL);
		}

		spin_lock(&trans->transaction->dirty_bgs_lock);
		if (list_empty(&cache->dirty_list)) {
			list_add_tail(&cache->dirty_list,
				      &trans->transaction->dirty_bgs);
			trans->transaction->num_dirty_bgs++;
			btrfs_get_block_group(cache);
		}
		spin_unlock(&trans->transaction->dirty_bgs_lock);

		/*
		 * No longer have used bytes in this block group, queue it for
		 * deletion. We do this after adding the block group to the
		 * dirty list to avoid races between cleaner kthread and space
		 * cache writeout.
		 */
		if (!alloc && old_val == 0) {
			spin_lock(&info->unused_bgs_lock);
			if (list_empty(&cache->bg_list)) {
				btrfs_get_block_group(cache);
				list_add_tail(&cache->bg_list,
					      &info->unused_bgs);
			}
			spin_unlock(&info->unused_bgs_lock);
		}

		btrfs_put_block_group(cache);
		total -= num_bytes;
		bytenr += num_bytes;
	}
	return 0;
}

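/*
 * Return the logical address of the first block group, used as a starting
 * point for allocator searches.  Falls back to a block group lookup when
 * fs_info->first_logical_byte is not cached.
 */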
static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
{
	struct btrfs_block_group_cache *cache;
	u64 bytenr;

	spin_lock(&root->fs_info->block_group_cache_lock);
	bytenr = root->fs_info->first_logical_byte;
	spin_unlock(&root->fs_info->block_group_cache_lock);

	if (bytenr < (u64)-1)
		return bytenr;

	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
	if (!cache)
		return 0;

	bytenr = cache->key.objectid;
	btrfs_put_block_group(cache);

	return bytenr;
}

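/*
 * Account num_bytes at bytenr as pinned in the given block group (optionally
 * converting an existing reservation) and mark the range dirty in the
 * pinned_extents tree so the space is given back once the transaction
 * commits.
 */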
static int pin_down_extent(struct btrfs_root *root,
			   struct btrfs_block_group_cache *cache,
			   u64 bytenr, u64 num_bytes, int reserved)
{
	spin_lock(&cache->space_info->lock);
	spin_lock(&cache->lock);
	cache->pinned += num_bytes;
	cache->space_info->bytes_pinned += num_bytes;
	if (reserved) {
		cache->reserved -= num_bytes;
		cache->space_info->bytes_reserved -= num_bytes;
	}
	spin_unlock(&cache->lock);
	spin_unlock(&cache->space_info->lock);

	trace_btrfs_space_reservation(root->fs_info, "pinned",
				      cache->space_info->flags, num_bytes, 1);
	set_extent_dirty(root->fs_info->pinned_extents, bytenr,
			 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
	return 0;
}

/*
 * this function must be called within transaction
 */
int btrfs_pin_extent(struct btrfs_root *root,
		     u64 bytenr, u64 num_bytes, int reserved)
{
	struct btrfs_block_group_cache *cache;

	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
	BUG_ON(!cache); /* Logic error */

	pin_down_extent(root, cache, bytenr, num_bytes, reserved);

	btrfs_put_block_group(cache);
	return 0;
}

/*
 * this function must be called within transaction
 */
int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
				    u64 bytenr, u64 num_bytes)
{
	struct btrfs_block_group_cache *cache;
	int ret;

	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
	if (!cache)
		return -EINVAL;

	/*
	 * pull in the free space cache (if any) so that our pin
	 * removes the free space from the cache.  We have load_only set
	 * to one because the slow code to read in the free extents does check
	 * the pinned extents.
	 */
	cache_block_group(cache, 1);

	pin_down_extent(root, cache, bytenr, num_bytes, 0);

	/* remove us from the free space cache (if we're there at all) */
	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
	btrfs_put_block_group(cache);
	return ret;
}

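/*
 * During tree-log replay, make sure a logged extent cannot be handed out
 * again by the allocator: remove it from the free space cache if caching
 * has already passed it, otherwise record it as an excluded range for the
 * caching thread to skip.
 */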
static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
{
	int ret;
	struct btrfs_block_group_cache *block_group;
	struct btrfs_caching_control *caching_ctl;

	block_group = btrfs_lookup_block_group(root->fs_info, start);
	if (!block_group)
		return -EINVAL;

	cache_block_group(block_group, 0);
	caching_ctl = get_caching_control(block_group);

	if (!caching_ctl) {
		/* Logic error */
		BUG_ON(!block_group_cache_done(block_group));
		ret = btrfs_remove_free_space(block_group, start, num_bytes);
	} else {
		mutex_lock(&caching_ctl->mutex);

		if (start >= caching_ctl->progress) {
			ret = add_excluded_extent(root, start, num_bytes);
		} else if (start + num_bytes <= caching_ctl->progress) {
			ret = btrfs_remove_free_space(block_group,
						      start, num_bytes);
		} else {
			num_bytes = caching_ctl->progress - start;
			ret = btrfs_remove_free_space(block_group,
						      start, num_bytes);
			if (ret)
				goto out_lock;

			num_bytes = (start + num_bytes) -
				caching_ctl->progress;
			start = caching_ctl->progress;
			ret = add_excluded_extent(root, start, num_bytes);
		}
out_lock:
		mutex_unlock(&caching_ctl->mutex);
		put_caching_control(caching_ctl);
	}
	btrfs_put_block_group(block_group);
	return ret;
}

int btrfs_exclude_logged_extents(struct btrfs_root *log,
				 struct extent_buffer *eb)
{
	struct btrfs_file_extent_item *item;
	struct btrfs_key key;
	int found_type;
	int i;

	if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
		return 0;

	for (i = 0; i < btrfs_header_nritems(eb); i++) {
		btrfs_item_key_to_cpu(eb, &key, i);
		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;
		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
		found_type = btrfs_file_extent_type(eb, item);
		if (found_type == BTRFS_FILE_EXTENT_INLINE)
			continue;
		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
			continue;
		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
		__exclude_logged_extent(log, key.objectid, key.offset);
	}

	return 0;
}

static void
btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
{
	atomic_inc(&bg->reservations);
}

void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
					const u64 start)
{
	struct btrfs_block_group_cache *bg;

	bg = btrfs_lookup_block_group(fs_info, start);
	ASSERT(bg);
	if (atomic_dec_and_test(&bg->reservations))
		wake_up_atomic_t(&bg->reservations);
	btrfs_put_block_group(bg);
}

static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
{
	schedule();
	return 0;
}

void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
{
	struct btrfs_space_info *space_info = bg->space_info;

	ASSERT(bg->ro);

	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
		return;

	/*
	 * Our block group is read only but before we set it to read only,
	 * some task might have had allocated an extent from it already, but it
	 * has not yet created a respective ordered extent (and added it to a
	 * root's list of ordered extents).
	 * Therefore wait for any task currently allocating extents, since the
	 * block group's reservations counter is incremented while a read lock
	 * on the groups' semaphore is held and decremented after releasing
	 * the read access on that semaphore and creating the ordered extent.
	 */
	down_write(&space_info->groups_sem);
	up_write(&space_info->groups_sem);

	wait_on_atomic_t(&bg->reservations,
			 btrfs_wait_bg_reservations_atomic_t,
			 TASK_UNINTERRUPTIBLE);
}


/**
 * btrfs_add_reserved_bytes - update the block_group and space info counters
 * @cache:	The cache we are manipulating
 * @ram_bytes:  The number of bytes of file content, and will be the same as
 *              @num_bytes except for the compress path.
 * @num_bytes:	The number of bytes in question
 * @delalloc:   The blocks are allocated for the delalloc write
 *
 * This is called by the allocator when it reserves space. Metadata
 * reservations should be called with RESERVE_ALLOC so we do the proper
 * ENOSPC accounting.  For data we handle the reservation through clearing the
 * delalloc bits in the io_tree.  We have to do this since we could end up
 * allocating less disk space for the amount of data we have reserved in the
 * case of compression.
 *
 * If this is a reservation and the block group has become read only we cannot
 * make the reservation and return -EAGAIN, otherwise this function always
 * succeeds.
 */
static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
				    u64 ram_bytes, u64 num_bytes, int delalloc)
{
	struct btrfs_space_info *space_info = cache->space_info;
	int ret = 0;

	spin_lock(&space_info->lock);
	spin_lock(&cache->lock);
	if (cache->ro) {
		ret = -EAGAIN;
	} else {
		cache->reserved += num_bytes;
		space_info->bytes_reserved += num_bytes;

		trace_btrfs_space_reservation(cache->fs_info,
				"space_info", space_info->flags,
				ram_bytes, 0);
		space_info->bytes_may_use -= ram_bytes;
		if (delalloc)
			cache->delalloc_bytes += num_bytes;
	}
	spin_unlock(&cache->lock);
	spin_unlock(&space_info->lock);
	return ret;
}
/**
 * btrfs_free_reserved_bytes - update the block_group and space info counters
 * @cache:      The cache we are manipulating
 * @num_bytes:  The number of bytes in question
 * @delalloc:   The blocks are allocated for the delalloc write
 *
 * This is called by somebody who is freeing space that was never actually used
 * on disk.  For example if you reserve some space for a new leaf in transaction
 * A and before transaction A commits you free that leaf, you call this with
 * reserve set to 0 in order to clear the reservation.
 */

static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
				     u64 num_bytes, int delalloc)
{
	struct btrfs_space_info *space_info = cache->space_info;
	int ret = 0;

	spin_lock(&space_info->lock);
	spin_lock(&cache->lock);
	if (cache->ro)
		space_info->bytes_readonly += num_bytes;
	cache->reserved -= num_bytes;
	space_info->bytes_reserved -= num_bytes;

	if (delalloc)
		cache->delalloc_bytes -= num_bytes;
	spin_unlock(&cache->lock);
	spin_unlock(&space_info->lock);
	return ret;
}
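
/*
 * Called around transaction commit: remember how far each cached block
 * group has progressed (last_byte_to_unpin) and flip pinned_extents to the
 * other freed_extents tree so newly freed extents are tracked for the next
 * transaction.
 */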
void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
				struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_caching_control *next;
	struct btrfs_caching_control *caching_ctl;
	struct btrfs_block_group_cache *cache;

	down_write(&fs_info->commit_root_sem);

	list_for_each_entry_safe(caching_ctl, next,
				 &fs_info->caching_block_groups, list) {
		cache = caching_ctl->block_group;
		if (block_group_cache_done(cache)) {
			cache->last_byte_to_unpin = (u64)-1;
			list_del_init(&caching_ctl->list);
			put_caching_control(caching_ctl);
		} else {
			cache->last_byte_to_unpin = caching_ctl->progress;
		}
	}

	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
		fs_info->pinned_extents = &fs_info->freed_extents[1];
	else
		fs_info->pinned_extents = &fs_info->freed_extents[0];

	up_write(&fs_info->commit_root_sem);

	update_global_block_rsv(fs_info);
}

/*
 * Returns the free cluster for the given space info and sets empty_cluster to
 * what it should be based on the mount options.
 */
static struct btrfs_free_cluster *
fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
		   u64 *empty_cluster)
{
	struct btrfs_free_cluster *ret = NULL;
	bool ssd = btrfs_test_opt(root->fs_info, SSD);

	*empty_cluster = 0;
	if (btrfs_mixed_space_info(space_info))
		return ret;

	if (ssd)
		*empty_cluster = SZ_2M;
	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		ret = &root->fs_info->meta_alloc_cluster;
		if (!ssd)
			*empty_cluster = SZ_64K;
	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
		ret = &root->fs_info->data_alloc_cluster;
	}

	return ret;
}

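/*
 * Walk [start, end] of a pinned range being released at commit time: give
 * the space back to the free space caches (unless return_free_space is
 * false), fix up pinned/readonly counters, top up the global block rsv, and
 * hand anything left over to waiting reservation tickets.
 */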
static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
			      const bool return_free_space)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_group_cache *cache = NULL;
	struct btrfs_space_info *space_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	struct btrfs_free_cluster *cluster = NULL;
	u64 len;
	u64 total_unpinned = 0;
	u64 empty_cluster = 0;
	bool readonly;

	while (start <= end) {
		readonly = false;
		if (!cache ||
		    start >= cache->key.objectid + cache->key.offset) {
			if (cache)
				btrfs_put_block_group(cache);
			total_unpinned = 0;
			cache = btrfs_lookup_block_group(fs_info, start);
			BUG_ON(!cache); /* Logic error */

			cluster = fetch_cluster_info(root,
						     cache->space_info,
						     &empty_cluster);
			empty_cluster <<= 1;
		}

		len = cache->key.objectid + cache->key.offset - start;
		len = min(len, end + 1 - start);

		if (start < cache->last_byte_to_unpin) {
			len = min(len, cache->last_byte_to_unpin - start);
			if (return_free_space)
				btrfs_add_free_space(cache, start, len);
		}

		start += len;
		total_unpinned += len;
		space_info = cache->space_info;

		/*
		 * If this space cluster has been marked as fragmented and we've
		 * unpinned enough in this block group to potentially allow a
		 * cluster to be created inside of it go ahead and clear the
		 * fragmented check.
		 */
		if (cluster && cluster->fragmented &&
		    total_unpinned > empty_cluster) {
			spin_lock(&cluster->lock);
			cluster->fragmented = 0;
			spin_unlock(&cluster->lock);
		}

6684
		spin_lock(&space_info->lock);
6685 6686
		spin_lock(&cache->lock);
		cache->pinned -= len;
6687
		space_info->bytes_pinned -= len;
J
Josef Bacik 已提交
6688 6689 6690

		trace_btrfs_space_reservation(fs_info, "pinned",
					      space_info->flags, len, 0);
6691
		space_info->max_extent_size = 0;
6692
		percpu_counter_add(&space_info->total_bytes_pinned, -len);
6693 6694 6695 6696
		if (cache->ro) {
			space_info->bytes_readonly += len;
			readonly = true;
		}
6697
		spin_unlock(&cache->lock);
6698 6699 6700 6701
		if (!readonly && return_free_space &&
		    global_rsv->space_info == space_info) {
			u64 to_add = len;
			WARN_ON(!return_free_space);
6702 6703
			spin_lock(&global_rsv->lock);
			if (!global_rsv->full) {
6704 6705 6706 6707
				to_add = min(len, global_rsv->size -
					     global_rsv->reserved);
				global_rsv->reserved += to_add;
				space_info->bytes_may_use += to_add;
6708 6709
				if (global_rsv->reserved >= global_rsv->size)
					global_rsv->full = 1;
6710 6711 6712 6713 6714
				trace_btrfs_space_reservation(fs_info,
							      "space_info",
							      space_info->flags,
							      to_add, 1);
				len -= to_add;
6715 6716
			}
			spin_unlock(&global_rsv->lock);
6717 6718 6719 6720
			/* Add to any tickets we may have */
			if (len)
				space_info_add_new_bytes(fs_info, space_info,
							 len);
6721 6722
		}
		spin_unlock(&space_info->lock);
C
Chris Mason 已提交
6723
	}
6724 6725 6726

	if (cache)
		btrfs_put_block_group(cache);
C
Chris Mason 已提交
6727 6728 6729 6730
	return 0;
}

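/*
 * Called during transaction commit: unpin everything that was freed in this
 * transaction (optionally issuing discards) and finish cleaning up the block
 * groups that were deleted while it ran.
 */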
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_group_cache *block_group, *tmp;
	struct list_head *deleted_bgs;
	struct extent_io_tree *unpin;
	u64 start;
	u64 end;
	int ret;

	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
		unpin = &fs_info->freed_extents[1];
	else
		unpin = &fs_info->freed_extents[0];

	while (!trans->aborted) {
		mutex_lock(&fs_info->unused_bg_unpin_mutex);
		ret = find_first_extent_bit(unpin, 0, &start, &end,
					    EXTENT_DIRTY, NULL);
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			break;
		}

		if (btrfs_test_opt(root->fs_info, DISCARD))
			ret = btrfs_discard_extent(root, start,
						   end + 1 - start, NULL);

		clear_extent_dirty(unpin, start, end);
		unpin_extent_range(root, start, end, true);
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
		cond_resched();
	}

	/*
	 * Transaction is finished.  We don't need the lock anymore.  We
	 * do need to clean up the block groups in case of a transaction
	 * abort.
	 */
	deleted_bgs = &trans->transaction->deleted_bgs;
	list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
		u64 trimmed = 0;

		ret = -EROFS;
		if (!trans->aborted)
			ret = btrfs_discard_extent(root,
						   block_group->key.objectid,
						   block_group->key.offset,
						   &trimmed);

		list_del_init(&block_group->bg_list);
		btrfs_put_block_group_trimming(block_group);
		btrfs_put_block_group(block_group);

		if (ret) {
			const char *errstr = btrfs_decode_error(ret);
			btrfs_warn(fs_info,
				   "Discard failed while removing blockgroup: errno=%d %s\n",
				   ret, errstr);
		}
	}

	return 0;
}

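/*
 * Account bytes that are about to be pinned or unpinned: map the owner/root
 * to the matching space_info (system, metadata or data) and adjust its
 * total_bytes_pinned counter (callers pass a negative num_bytes to subtract).
 */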
static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
			     u64 owner, u64 root_objectid)
{
	struct btrfs_space_info *space_info;
	u64 flags;

	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
			flags = BTRFS_BLOCK_GROUP_SYSTEM;
		else
			flags = BTRFS_BLOCK_GROUP_METADATA;
	} else {
		flags = BTRFS_BLOCK_GROUP_DATA;
	}

	space_info = __find_space_info(fs_info, flags);
	BUG_ON(!space_info); /* Logic bug */
	percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
}

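/*
 * Drop refs_to_drop references to the extent described by the delayed ref
 * node.  Once the reference count reaches zero the extent item is removed,
 * checksums are dropped for data extents and the space is returned to the
 * block group.
 */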
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
				struct btrfs_root *root,
6819
				struct btrfs_delayed_ref_node *node, u64 parent,
6820 6821
				u64 root_objectid, u64 owner_objectid,
				u64 owner_offset, int refs_to_drop,
6822
				struct btrfs_delayed_extent_op *extent_op)
6823
{
C
Chris Mason 已提交
6824
	struct btrfs_key key;
6825
	struct btrfs_path *path;
6826 6827
	struct btrfs_fs_info *info = root->fs_info;
	struct btrfs_root *extent_root = info->extent_root;
6828
	struct extent_buffer *leaf;
6829 6830
	struct btrfs_extent_item *ei;
	struct btrfs_extent_inline_ref *iref;
6831
	int ret;
6832
	int is_data;
6833 6834 6835
	int extent_slot = 0;
	int found_extent = 0;
	int num_to_del = 1;
6836 6837
	u32 item_size;
	u64 refs;
6838 6839
	u64 bytenr = node->bytenr;
	u64 num_bytes = node->num_bytes;
J
Josef Bacik 已提交
6840
	int last_ref = 0;
6841 6842
	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
						 SKINNY_METADATA);
C
Chris Mason 已提交
6843

6844
	path = btrfs_alloc_path();
6845 6846
	if (!path)
		return -ENOMEM;
6847

6848
	path->reada = READA_FORWARD;
6849
	path->leave_spinning = 1;
6850 6851 6852 6853

	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
	BUG_ON(!is_data && refs_to_drop != 1);

6854 6855 6856
	if (is_data)
		skinny_metadata = 0;

6857 6858 6859 6860
	ret = lookup_extent_backref(trans, extent_root, path, &iref,
				    bytenr, num_bytes, parent,
				    root_objectid, owner_objectid,
				    owner_offset);
6861
	if (ret == 0) {
6862
		extent_slot = path->slots[0];
6863 6864
		while (extent_slot >= 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
6865
					      extent_slot);
6866
			if (key.objectid != bytenr)
6867
				break;
6868 6869
			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
			    key.offset == num_bytes) {
6870 6871 6872
				found_extent = 1;
				break;
			}
6873 6874 6875 6876 6877
			if (key.type == BTRFS_METADATA_ITEM_KEY &&
			    key.offset == owner_objectid) {
				found_extent = 1;
				break;
			}
6878 6879
			if (path->slots[0] - extent_slot > 5)
				break;
6880
			extent_slot--;
6881
		}
6882 6883 6884 6885 6886
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
		item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
		if (found_extent && item_size < sizeof(*ei))
			found_extent = 0;
#endif
Z
Zheng Yan 已提交
6887
		if (!found_extent) {
6888
			BUG_ON(iref);
6889
			ret = remove_extent_backref(trans, extent_root, path,
6890
						    NULL, refs_to_drop,
J
Josef Bacik 已提交
6891
						    is_data, &last_ref);
6892
			if (ret) {
6893
				btrfs_abort_transaction(trans, ret);
6894 6895
				goto out;
			}
6896
			btrfs_release_path(path);
6897
			path->leave_spinning = 1;
6898 6899 6900 6901 6902

			key.objectid = bytenr;
			key.type = BTRFS_EXTENT_ITEM_KEY;
			key.offset = num_bytes;

6903 6904 6905 6906 6907
			if (!is_data && skinny_metadata) {
				key.type = BTRFS_METADATA_ITEM_KEY;
				key.offset = owner_objectid;
			}

Z
Zheng Yan 已提交
6908 6909
			ret = btrfs_search_slot(trans, extent_root,
						&key, path, -1, 1);
6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925
			if (ret > 0 && skinny_metadata && path->slots[0]) {
				/*
				 * Couldn't find our skinny metadata item,
				 * see if we have ye olde extent item.
				 */
				path->slots[0]--;
				btrfs_item_key_to_cpu(path->nodes[0], &key,
						      path->slots[0]);
				if (key.objectid == bytenr &&
				    key.type == BTRFS_EXTENT_ITEM_KEY &&
				    key.offset == num_bytes)
					ret = 0;
			}

			if (ret > 0 && skinny_metadata) {
				skinny_metadata = false;
6926
				key.objectid = bytenr;
6927 6928 6929 6930 6931 6932 6933
				key.type = BTRFS_EXTENT_ITEM_KEY;
				key.offset = num_bytes;
				btrfs_release_path(path);
				ret = btrfs_search_slot(trans, extent_root,
							&key, path, -1, 1);
			}

6934
			if (ret) {
6935
				btrfs_err(info, "umm, got %d back from search, was looking for %llu",
6936
					ret, bytenr);
6937 6938 6939
				if (ret > 0)
					btrfs_print_leaf(extent_root,
							 path->nodes[0]);
6940
			}
6941
			if (ret < 0) {
6942
				btrfs_abort_transaction(trans, ret);
6943 6944
				goto out;
			}
Z
Zheng Yan 已提交
6945 6946
			extent_slot = path->slots[0];
		}
6947
	} else if (WARN_ON(ret == -ENOENT)) {
6948
		btrfs_print_leaf(extent_root, path->nodes[0]);
6949 6950
		btrfs_err(info,
			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
6951 6952
			bytenr, parent, root_objectid, owner_objectid,
			owner_offset);
6953
		btrfs_abort_transaction(trans, ret);
6954
		goto out;
6955
	} else {
6956
		btrfs_abort_transaction(trans, ret);
6957
		goto out;
6958
	}
6959 6960

	leaf = path->nodes[0];
6961 6962 6963 6964 6965 6966
	item_size = btrfs_item_size_nr(leaf, extent_slot);
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	if (item_size < sizeof(*ei)) {
		BUG_ON(found_extent || extent_slot != path->slots[0]);
		ret = convert_extent_item_v0(trans, extent_root, path,
					     owner_objectid, 0);
6967
		if (ret < 0) {
6968
			btrfs_abort_transaction(trans, ret);
6969 6970
			goto out;
		}
6971

6972
		btrfs_release_path(path);
6973 6974 6975 6976 6977 6978 6979 6980 6981
		path->leave_spinning = 1;

		key.objectid = bytenr;
		key.type = BTRFS_EXTENT_ITEM_KEY;
		key.offset = num_bytes;

		ret = btrfs_search_slot(trans, extent_root, &key, path,
					-1, 1);
		if (ret) {
6982
			btrfs_err(info, "umm, got %d back from search, was looking for %llu",
6983
				ret, bytenr);
6984 6985
			btrfs_print_leaf(extent_root, path->nodes[0]);
		}
6986
		if (ret < 0) {
6987
			btrfs_abort_transaction(trans, ret);
6988 6989 6990
			goto out;
		}

6991 6992 6993 6994 6995 6996
		extent_slot = path->slots[0];
		leaf = path->nodes[0];
		item_size = btrfs_item_size_nr(leaf, extent_slot);
	}
#endif
	BUG_ON(item_size < sizeof(*ei));
6997
	ei = btrfs_item_ptr(leaf, extent_slot,
C
Chris Mason 已提交
6998
			    struct btrfs_extent_item);
6999 7000
	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
	    key.type == BTRFS_EXTENT_ITEM_KEY) {
7001 7002 7003 7004 7005
		struct btrfs_tree_block_info *bi;
		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
		bi = (struct btrfs_tree_block_info *)(ei + 1);
		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
	}
7006

7007
	refs = btrfs_extent_refs(leaf, ei);
7008 7009
	if (refs < refs_to_drop) {
		btrfs_err(info, "trying to drop %d refs but we only have %Lu "
7010
			  "for bytenr %Lu", refs_to_drop, refs, bytenr);
7011
		ret = -EINVAL;
7012
		btrfs_abort_transaction(trans, ret);
7013 7014
		goto out;
	}
7015
	refs -= refs_to_drop;
7016

7017 7018 7019 7020 7021 7022
	if (refs > 0) {
		if (extent_op)
			__run_delayed_extent_op(extent_op, leaf, ei);
		/*
		 * In the case of inline back ref, reference count will
		 * be updated by remove_extent_backref
7023
		 */
7024 7025 7026 7027 7028 7029 7030 7031 7032
		if (iref) {
			BUG_ON(!found_extent);
		} else {
			btrfs_set_extent_refs(leaf, ei, refs);
			btrfs_mark_buffer_dirty(leaf);
		}
		if (found_extent) {
			ret = remove_extent_backref(trans, extent_root, path,
						    iref, refs_to_drop,
J
Josef Bacik 已提交
7033
						    is_data, &last_ref);
7034
			if (ret) {
7035
				btrfs_abort_transaction(trans, ret);
7036 7037
				goto out;
			}
7038
		}
7039 7040
		add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
				 root_objectid);
7041 7042 7043
	} else {
		if (found_extent) {
			BUG_ON(is_data && refs_to_drop !=
7044
			       extent_data_ref_count(path, iref));
7045 7046 7047 7048 7049 7050 7051
			if (iref) {
				BUG_ON(path->slots[0] != extent_slot);
			} else {
				BUG_ON(path->slots[0] != extent_slot + 1);
				path->slots[0] = extent_slot;
				num_to_del = 2;
			}
C
Chris Mason 已提交
7052
		}
7053

J
Josef Bacik 已提交
7054
		last_ref = 1;
7055 7056
		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
				      num_to_del);
7057
		if (ret) {
7058
			btrfs_abort_transaction(trans, ret);
7059 7060
			goto out;
		}
7061
		btrfs_release_path(path);
7062

7063
		if (is_data) {
7064
			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
7065
			if (ret) {
7066
				btrfs_abort_transaction(trans, ret);
7067 7068
				goto out;
			}
7069 7070
		}

7071 7072 7073
		ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
					     num_bytes);
		if (ret) {
7074
			btrfs_abort_transaction(trans, ret);
7075 7076 7077
			goto out;
		}

7078
		ret = update_block_group(trans, root, bytenr, num_bytes, 0);
7079
		if (ret) {
7080
			btrfs_abort_transaction(trans, ret);
7081 7082
			goto out;
		}
7083
	}
J
Josef Bacik 已提交
7084 7085
	btrfs_release_path(path);

7086
out:
7087
	btrfs_free_path(path);
7088 7089 7090
	return ret;
}

/*
 * when we free a block, it is possible (and likely) that we free the last
 * delayed ref for that extent as well.  This searches the delayed ref tree for
 * a given extent, and if there are no other delayed refs to be processed, it
 * removes it from the tree.
 */
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root, u64 bytenr)
{
	struct btrfs_delayed_ref_head *head;
	struct btrfs_delayed_ref_root *delayed_refs;
	int ret = 0;

	delayed_refs = &trans->transaction->delayed_refs;
	spin_lock(&delayed_refs->lock);
	head = btrfs_find_delayed_ref_head(trans, bytenr);
	if (!head)
		goto out_delayed_unlock;

	spin_lock(&head->lock);
	if (!list_empty(&head->ref_list))
		goto out;

	if (head->extent_op) {
		if (!head->must_insert_reserved)
			goto out;
		btrfs_free_delayed_extent_op(head->extent_op);
		head->extent_op = NULL;
	}

	/*
	 * waiting for the lock here would deadlock.  If someone else has it
	 * locked they are already in the process of dropping it anyway
	 */
	if (!mutex_trylock(&head->mutex))
		goto out;

	/*
	 * at this point we have a head with no other entries.  Go
	 * ahead and process it.
	 */
	head->node.in_tree = 0;
	rb_erase(&head->href_node, &delayed_refs->href_root);

	atomic_dec(&delayed_refs->num_entries);

	/*
	 * we don't take a ref on the node because we're removing it from the
	 * tree, so we just steal the ref the tree was holding.
	 */
	delayed_refs->num_heads--;
	if (head->processing == 0)
		delayed_refs->num_heads_ready--;
	head->processing = 0;
	spin_unlock(&head->lock);
	spin_unlock(&delayed_refs->lock);

	BUG_ON(head->extent_op);
	if (head->must_insert_reserved)
		ret = 1;

	mutex_unlock(&head->mutex);
	btrfs_put_delayed_ref(&head->node);
	return ret;
out:
	spin_unlock(&head->lock);

out_delayed_unlock:
	spin_unlock(&delayed_refs->lock);
	return 0;
}

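/*
 * Free a tree block: queue a delayed ref drop and, when this was the last
 * reference and the block was allocated in the running transaction, try to
 * reclaim the space immediately instead of pinning it until commit.
 */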
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct extent_buffer *buf,
			   u64 parent, int last_ref)
{
	int pin = 1;
	int ret;

	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
					buf->start, buf->len,
					parent, root->root_key.objectid,
					btrfs_header_level(buf),
					BTRFS_DROP_DELAYED_REF, NULL);
		BUG_ON(ret); /* -ENOMEM */
	}

	if (!last_ref)
		return;

	if (btrfs_header_generation(buf) == trans->transid) {
		struct btrfs_block_group_cache *cache;

		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
			ret = check_ref_cleanup(trans, root, buf->start);
			if (!ret)
				goto out;
		}

		cache = btrfs_lookup_block_group(root->fs_info, buf->start);

		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
			pin_down_extent(root, cache, buf->start, buf->len, 1);
			btrfs_put_block_group(cache);
			goto out;
		}

		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));

		btrfs_add_free_space(cache, buf->start, buf->len);
		btrfs_free_reserved_bytes(cache, buf->len, 0);
		btrfs_put_block_group(cache);
		trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
		pin = 0;
	}
out:
	if (pin)
		add_pinned_bytes(root->fs_info, buf->len,
				 btrfs_header_level(buf),
				 root->root_key.objectid);

	/*
	 * Deleting the buffer, clear the corrupt flag since it doesn't matter
	 * anymore.
	 */
	clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
}

/* Can return -ENOMEM */
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
		      u64 owner, u64 offset)
{
	int ret;
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (btrfs_is_testing(fs_info))
		return 0;

	add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);

	/*
	 * tree log blocks never actually go into the extent allocation
	 * tree, just update pinning info and exit early.
	 */
	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
		/* unlocks the pinned mutex */
		btrfs_pin_extent(root, bytenr, num_bytes, 1);
		ret = 0;
	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
					num_bytes,
					parent, root_objectid, (int)owner,
					BTRFS_DROP_DELAYED_REF, NULL);
	} else {
		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
						num_bytes,
						parent, root_objectid, owner,
						offset, 0,
						BTRFS_DROP_DELAYED_REF, NULL);
	}
	return ret;
}

/*
 * when we wait for progress in the block group caching, it's because
 * our allocation attempt failed at least once.  So, we must sleep
 * and let some progress happen before we try again.
 *
 * This function will sleep at least once waiting for new free space to
 * show up, and then it will check the block group free space numbers
 * for our min num_bytes.  Another option is to have it go ahead
 * and look in the rbtree for a free extent of a given size, but this
 * is a good start.
 *
 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 * any of the information in this block group.
 */
static noinline void
wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
				u64 num_bytes)
{
	struct btrfs_caching_control *caching_ctl;

	caching_ctl = get_caching_control(cache);
	if (!caching_ctl)
		return;

	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
		   (cache->free_space_ctl->free_space >= num_bytes));

	put_caching_control(caching_ctl);
}

static noinline int
wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
{
	struct btrfs_caching_control *caching_ctl;
	int ret = 0;

	caching_ctl = get_caching_control(cache);
	if (!caching_ctl)
		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;

	wait_event(caching_ctl->wait, block_group_cache_done(cache));
	if (cache->cached == BTRFS_CACHE_ERROR)
		ret = -EIO;
	put_caching_control(caching_ctl);
	return ret;
}

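/*
 * Map block group allocation flags to a btrfs_raid_types index, used for
 * space_info->block_groups[] and the raid name table below.
 */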
int __get_raid_index(u64 flags)
{
	if (flags & BTRFS_BLOCK_GROUP_RAID10)
		return BTRFS_RAID_RAID10;
	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
		return BTRFS_RAID_RAID1;
	else if (flags & BTRFS_BLOCK_GROUP_DUP)
		return BTRFS_RAID_DUP;
	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
		return BTRFS_RAID_RAID0;
	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
		return BTRFS_RAID_RAID5;
	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
		return BTRFS_RAID_RAID6;

	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
}

int get_block_group_index(struct btrfs_block_group_cache *cache)
{
	return __get_raid_index(cache->flags);
}

static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10]	= "raid10",
	[BTRFS_RAID_RAID1]	= "raid1",
	[BTRFS_RAID_DUP]	= "dup",
	[BTRFS_RAID_RAID0]	= "raid0",
	[BTRFS_RAID_SINGLE]	= "single",
	[BTRFS_RAID_RAID5]	= "raid5",
	[BTRFS_RAID_RAID6]	= "raid6",
};

static const char *get_raid_name(enum btrfs_raid_types type)
{
	if (type >= BTRFS_NR_RAID_TYPES)
		return NULL;

	return btrfs_raid_type_names[type];
}

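/*
 * Fallback stages for find_free_extent(): each pass relaxes the constraints,
 * from using only already-cached block groups, to waiting for caching, to
 * forcing a chunk allocation, to finally retrying without any empty_size or
 * empty_cluster padding.
 */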
enum btrfs_loop_type {
	LOOP_CACHING_NOWAIT = 0,
	LOOP_CACHING_WAIT = 1,
	LOOP_ALLOC_CHUNK = 2,
	LOOP_NO_EMPTY_SIZE = 3,
};

7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374
static inline void
btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
		       int delalloc)
{
	if (delalloc)
		down_read(&cache->data_rwsem);
}

static inline void
btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
		       int delalloc)
{
	btrfs_get_block_group(cache);
	if (delalloc)
		down_read(&cache->data_rwsem);
}

static struct btrfs_block_group_cache *
btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
		   struct btrfs_free_cluster *cluster,
		   int delalloc)
{
S
Sudip Mukherjee 已提交
7375
	struct btrfs_block_group_cache *used_bg = NULL;
7376

7377
	spin_lock(&cluster->refill_lock);
7378 7379 7380 7381 7382 7383
	while (1) {
		used_bg = cluster->block_group;
		if (!used_bg)
			return NULL;

		if (used_bg == block_group)
7384 7385
			return used_bg;

7386
		btrfs_get_block_group(used_bg);
7387

7388 7389
		if (!delalloc)
			return used_bg;
7390

7391 7392
		if (down_read_trylock(&used_bg->data_rwsem))
			return used_bg;
7393

7394
		spin_unlock(&cluster->refill_lock);
7395

7396
		down_read(&used_bg->data_rwsem);
7397

7398 7399 7400
		spin_lock(&cluster->refill_lock);
		if (used_bg == cluster->block_group)
			return used_bg;
7401

7402 7403 7404
		up_read(&used_bg->data_rwsem);
		btrfs_put_block_group(used_bg);
	}
7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415
}

static inline void
btrfs_release_block_group(struct btrfs_block_group_cache *cache,
			 int delalloc)
{
	if (delalloc)
		up_read(&cache->data_rwsem);
	btrfs_put_block_group(cache);
}

7416 7417 7418
/*
 * walks the btree of allocated extents and finds a hole of a given size.
 * The key ins is changed to record the hole:
7419
 * ins->objectid == start position
7420
 * ins->flags = BTRFS_EXTENT_ITEM_KEY
7421
 * ins->offset == the size of the hole.
7422
 * Any available blocks before search_start are skipped.
7423 7424 7425
 *
 * If there is no suitable free space, we will record the max size of
 * the free space extent currently.
7426
 */
7427
static noinline int find_free_extent(struct btrfs_root *orig_root,
7428 7429 7430
				u64 ram_bytes, u64 num_bytes, u64 empty_size,
				u64 hint_byte, struct btrfs_key *ins,
				u64 flags, int delalloc)
7431
{
7432
	int ret = 0;
C
Chris Mason 已提交
7433
	struct btrfs_root *root = orig_root->fs_info->extent_root;
7434
	struct btrfs_free_cluster *last_ptr = NULL;
7435
	struct btrfs_block_group_cache *block_group = NULL;
7436
	u64 search_start = 0;
7437
	u64 max_extent_size = 0;
7438
	u64 empty_cluster = 0;
7439
	struct btrfs_space_info *space_info;
7440
	int loop = 0;
7441
	int index = __get_raid_index(flags);
7442
	bool failed_cluster_refill = false;
7443
	bool failed_alloc = false;
7444
	bool use_cluster = true;
7445
	bool have_caching_bg = false;
7446
	bool orig_have_caching_bg = false;
7447
	bool full_search = false;
7448

7449
	WARN_ON(num_bytes < root->sectorsize);
7450
	ins->type = BTRFS_EXTENT_ITEM_KEY;
7451 7452
	ins->objectid = 0;
	ins->offset = 0;
7453

7454
	trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
J
Josef Bacik 已提交
7455

7456
	space_info = __find_space_info(root->fs_info, flags);
7457
	if (!space_info) {
7458
		btrfs_err(root->fs_info, "No space info for %llu", flags);
7459 7460
		return -ENOSPC;
	}
J
Josef Bacik 已提交
7461

7462
	/*
7463 7464 7465 7466 7467 7468 7469 7470
	 * If our free space is heavily fragmented we may not be able to make
	 * big contiguous allocations, so instead of doing the expensive search
	 * for free space, simply return ENOSPC with our max_extent_size so we
	 * can go ahead and search for a more manageable chunk.
	 *
	 * If our max_extent_size is large enough for our allocation simply
	 * disable clustering since we will likely not be able to find enough
	 * space to create a cluster and induce latency trying.
7471
	 */
7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482
	if (unlikely(space_info->max_extent_size)) {
		spin_lock(&space_info->lock);
		if (space_info->max_extent_size &&
		    num_bytes > space_info->max_extent_size) {
			ins->offset = space_info->max_extent_size;
			spin_unlock(&space_info->lock);
			return -ENOSPC;
		} else if (space_info->max_extent_size) {
			use_cluster = false;
		}
		spin_unlock(&space_info->lock);
7483
	}
J
Josef Bacik 已提交
7484

7485
	last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
7486
	if (last_ptr) {
7487 7488 7489
		spin_lock(&last_ptr->lock);
		if (last_ptr->block_group)
			hint_byte = last_ptr->window_start;
7490 7491 7492 7493 7494 7495 7496 7497 7498
		if (last_ptr->fragmented) {
			/*
			 * We still set window_start so we can keep track of the
			 * last place we found an allocation to try and save
			 * some time.
			 */
			hint_byte = last_ptr->window_start;
			use_cluster = false;
		}
7499
		spin_unlock(&last_ptr->lock);
7500
	}
7501

7502
	search_start = max(search_start, first_logical_byte(root, 0));
7503
	search_start = max(search_start, hint_byte);
J
Josef Bacik 已提交
7504 7505 7506
	if (search_start == hint_byte) {
		block_group = btrfs_lookup_block_group(root->fs_info,
						       search_start);
J
Josef Bacik 已提交
7507 7508 7509
		/*
		 * we don't want to use the block group if it doesn't match our
		 * allocation bits, or if its not cached.
7510 7511 7512
		 *
		 * However if we are re-searching with an ideal block group
		 * picked out then we don't care that the block group is cached.
J
Josef Bacik 已提交
7513
		 */
7514
		if (block_group && block_group_bits(block_group, flags) &&
7515
		    block_group->cached != BTRFS_CACHE_NO) {
J
Josef Bacik 已提交
7516
			down_read(&space_info->groups_sem);
7517 7518 7519 7520 7521 7522 7523 7524 7525 7526
			if (list_empty(&block_group->list) ||
			    block_group->ro) {
				/*
				 * someone is removing this block group,
				 * we can't jump into the have_block_group
				 * target because our list pointers are not
				 * valid
				 */
				btrfs_put_block_group(block_group);
				up_read(&space_info->groups_sem);
7527
			} else {
7528
				index = get_block_group_index(block_group);
7529
				btrfs_lock_block_group(block_group, delalloc);
7530
				goto have_block_group;
7531
			}
J
Josef Bacik 已提交
7532
		} else if (block_group) {
7533
			btrfs_put_block_group(block_group);
J
Josef Bacik 已提交
7534
		}
7535
	}
J
Josef Bacik 已提交
7536
search:
7537
	have_caching_bg = false;
7538 7539
	if (index == 0 || index == __get_raid_index(flags))
		full_search = true;
7540
	down_read(&space_info->groups_sem);
7541 7542
	list_for_each_entry(block_group, &space_info->block_groups[index],
			    list) {
7543
		u64 offset;
J
Josef Bacik 已提交
7544
		int cached;
7545

7546
		btrfs_grab_block_group(block_group, delalloc);
J
Josef Bacik 已提交
7547
		search_start = block_group->key.objectid;
7548

7549 7550 7551 7552 7553
		/*
		 * this can happen if we end up cycling through all the
		 * raid types, but we want to make sure we only allocate
		 * for the proper type.
		 */
7554
		if (!block_group_bits(block_group, flags)) {
7555 7556
		    u64 extra = BTRFS_BLOCK_GROUP_DUP |
				BTRFS_BLOCK_GROUP_RAID1 |
D
David Woodhouse 已提交
7557 7558
				BTRFS_BLOCK_GROUP_RAID5 |
				BTRFS_BLOCK_GROUP_RAID6 |
7559 7560 7561 7562 7563 7564 7565
				BTRFS_BLOCK_GROUP_RAID10;

			/*
			 * if they asked for extra copies and this block group
			 * doesn't provide them, bail.  This does allow us to
			 * fill raid0 from raid1.
			 */
7566
			if ((flags & extra) && !(block_group->flags & extra))
7567 7568 7569
				goto loop;
		}

J
Josef Bacik 已提交
7570
have_block_group:
7571 7572
		cached = block_group_cache_done(block_group);
		if (unlikely(!cached)) {
7573
			have_caching_bg = true;
7574
			ret = cache_block_group(block_group, 0);
7575 7576
			BUG_ON(ret < 0);
			ret = 0;
J
Josef Bacik 已提交
7577 7578
		}

7579 7580
		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
			goto loop;
7581
		if (unlikely(block_group->ro))
J
Josef Bacik 已提交
7582
			goto loop;
J
Josef Bacik 已提交
7583

7584
		/*
7585 7586
		 * Ok we want to try and use the cluster allocator, so
		 * lets look there
7587
		 */
7588
		if (last_ptr && use_cluster) {
7589
			struct btrfs_block_group_cache *used_block_group;
7590
			unsigned long aligned_cluster;
7591 7592 7593 7594
			/*
			 * the refill lock keeps out other
			 * people trying to start a new cluster
			 */
7595 7596 7597 7598
			used_block_group = btrfs_lock_cluster(block_group,
							      last_ptr,
							      delalloc);
			if (!used_block_group)
7599
				goto refill_cluster;
7600

7601 7602 7603 7604
			if (used_block_group != block_group &&
			    (used_block_group->ro ||
			     !block_group_bits(used_block_group, flags)))
				goto release_cluster;
7605

7606
			offset = btrfs_alloc_from_cluster(used_block_group,
7607 7608 7609 7610
						last_ptr,
						num_bytes,
						used_block_group->key.objectid,
						&max_extent_size);
7611 7612 7613
			if (offset) {
				/* we have a block, we're done */
				spin_unlock(&last_ptr->refill_lock);
J
Josef Bacik 已提交
7614
				trace_btrfs_reserve_extent_cluster(root,
7615 7616
						used_block_group,
						search_start, num_bytes);
7617
				if (used_block_group != block_group) {
7618 7619
					btrfs_release_block_group(block_group,
								  delalloc);
7620 7621
					block_group = used_block_group;
				}
7622 7623 7624
				goto checks;
			}

7625
			WARN_ON(last_ptr->block_group != used_block_group);
7626
release_cluster:
7627 7628 7629 7630 7631 7632 7633 7634
			/* If we are on LOOP_NO_EMPTY_SIZE, we can't
			 * set up a new cluster, so let's just skip it
			 * and let the allocator find whatever block
			 * it can find.  If we reach this point, we
			 * will have tried the cluster allocator
			 * plenty of times and not have found
			 * anything, so we are likely way too
			 * fragmented for the clustering stuff to find
7635 7636 7637 7638 7639 7640 7641 7642
			 * anything.
			 *
			 * However, if the cluster is taken from the
			 * current block group, release the cluster
			 * first, so that we stand a better chance of
			 * succeeding in the unclustered
			 * allocation.  */
			if (loop >= LOOP_NO_EMPTY_SIZE &&
7643
			    used_block_group != block_group) {
7644
				spin_unlock(&last_ptr->refill_lock);
7645 7646
				btrfs_release_block_group(used_block_group,
							  delalloc);
7647 7648 7649
				goto unclustered_alloc;
			}

7650 7651 7652 7653 7654 7655
			/*
			 * this cluster didn't work out, free it and
			 * start over
			 */
			btrfs_return_cluster_to_free_space(NULL, last_ptr);

7656 7657 7658 7659
			if (used_block_group != block_group)
				btrfs_release_block_group(used_block_group,
							  delalloc);
refill_cluster:
7660 7661 7662 7663 7664
			if (loop >= LOOP_NO_EMPTY_SIZE) {
				spin_unlock(&last_ptr->refill_lock);
				goto unclustered_alloc;
			}

7665 7666 7667 7668
			aligned_cluster = max_t(unsigned long,
						empty_cluster + empty_size,
					      block_group->full_stripe_len);

7669
			/* allocate a cluster in this block group */
7670 7671 7672 7673
			ret = btrfs_find_space_cluster(root, block_group,
						       last_ptr, search_start,
						       num_bytes,
						       aligned_cluster);
7674 7675 7676 7677 7678 7679
			if (ret == 0) {
				/*
				 * now pull our allocation out of this
				 * cluster
				 */
				offset = btrfs_alloc_from_cluster(block_group,
7680 7681 7682 7683
							last_ptr,
							num_bytes,
							search_start,
							&max_extent_size);
7684 7685 7686
				if (offset) {
					/* we found one, proceed */
					spin_unlock(&last_ptr->refill_lock);
J
Josef Bacik 已提交
7687 7688 7689
					trace_btrfs_reserve_extent_cluster(root,
						block_group, search_start,
						num_bytes);
7690 7691
					goto checks;
				}
7692 7693
			} else if (!cached && loop > LOOP_CACHING_NOWAIT
				   && !failed_cluster_refill) {
J
Josef Bacik 已提交
7694 7695
				spin_unlock(&last_ptr->refill_lock);

7696
				failed_cluster_refill = true;
J
Josef Bacik 已提交
7697 7698 7699
				wait_block_group_cache_progress(block_group,
				       num_bytes + empty_cluster + empty_size);
				goto have_block_group;
7700
			}
J
Josef Bacik 已提交
7701

7702 7703 7704 7705 7706 7707
			/*
			 * at this point we either didn't find a cluster
			 * or we weren't able to allocate a block from our
			 * cluster.  Free the cluster we've been trying
			 * to use, and go to the next block group
			 */
7708
			btrfs_return_cluster_to_free_space(NULL, last_ptr);
7709
			spin_unlock(&last_ptr->refill_lock);
7710
			goto loop;
7711 7712
		}

7713
unclustered_alloc:
7714 7715 7716 7717 7718 7719 7720 7721 7722 7723
		/*
		 * We are doing an unclustered alloc, set the fragmented flag so
		 * we don't bother trying to setup a cluster again until we get
		 * more space.
		 */
		if (unlikely(last_ptr)) {
			spin_lock(&last_ptr->lock);
			last_ptr->fragmented = 1;
			spin_unlock(&last_ptr->lock);
		}
7724 7725 7726 7727
		spin_lock(&block_group->free_space_ctl->tree_lock);
		if (cached &&
		    block_group->free_space_ctl->free_space <
		    num_bytes + empty_cluster + empty_size) {
7728 7729 7730 7731
			if (block_group->free_space_ctl->free_space >
			    max_extent_size)
				max_extent_size =
					block_group->free_space_ctl->free_space;
7732 7733 7734 7735 7736
			spin_unlock(&block_group->free_space_ctl->tree_lock);
			goto loop;
		}
		spin_unlock(&block_group->free_space_ctl->tree_lock);

7737
		offset = btrfs_find_space_for_alloc(block_group, search_start,
7738 7739
						    num_bytes, empty_size,
						    &max_extent_size);
7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750
		/*
		 * If we didn't find a chunk, and we haven't failed on this
		 * block group before, and this block group is in the middle of
		 * caching and we are ok with waiting, then go ahead and wait
		 * for progress to be made, and set failed_alloc to true.
		 *
		 * If failed_alloc is true then we've already waited on this
		 * block group once and should move on to the next block group.
		 */
		if (!offset && !failed_alloc && !cached &&
		    loop > LOOP_CACHING_NOWAIT) {
J
Josef Bacik 已提交
7751
			wait_block_group_cache_progress(block_group,
7752 7753
						num_bytes + empty_size);
			failed_alloc = true;
J
Josef Bacik 已提交
7754
			goto have_block_group;
7755 7756
		} else if (!offset) {
			goto loop;
J
Josef Bacik 已提交
7757
		}
7758
checks:
7759
		search_start = ALIGN(offset, root->stripesize);
7760

J
Josef Bacik 已提交
7761 7762
		/* move on to the next group */
		if (search_start + num_bytes >
7763 7764
		    block_group->key.objectid + block_group->key.offset) {
			btrfs_add_free_space(block_group, offset, num_bytes);
J
Josef Bacik 已提交
7765
			goto loop;
7766
		}
7767

7768
		if (offset < search_start)
7769
			btrfs_add_free_space(block_group, offset,
7770 7771
					     search_start - offset);
		BUG_ON(offset > search_start);
J
Josef Bacik 已提交
7772

7773 7774
		ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
				num_bytes, delalloc);
7775
		if (ret == -EAGAIN) {
7776
			btrfs_add_free_space(block_group, offset, num_bytes);
J
Josef Bacik 已提交
7777
			goto loop;
J
Josef Bacik 已提交
7778
		}
7779
		btrfs_inc_block_group_reservations(block_group);
7780

7781
		/* we are all good, lets return */
J
Josef Bacik 已提交
7782 7783
		ins->objectid = search_start;
		ins->offset = num_bytes;
7784

J
Josef Bacik 已提交
7785 7786
		trace_btrfs_reserve_extent(orig_root, block_group,
					   search_start, num_bytes);
7787
		btrfs_release_block_group(block_group, delalloc);
J
Josef Bacik 已提交
7788 7789
		break;
loop:
7790
		failed_cluster_refill = false;
7791
		failed_alloc = false;
7792
		BUG_ON(index != get_block_group_index(block_group));
7793
		btrfs_release_block_group(block_group, delalloc);
J
Josef Bacik 已提交
7794 7795 7796
	}
	up_read(&space_info->groups_sem);

7797 7798 7799 7800
	if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
		&& !orig_have_caching_bg)
		orig_have_caching_bg = true;

7801 7802 7803
	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
		goto search;

7804 7805 7806
	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
		goto search;

7807
	/*
7808 7809
	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
	 *			caching kthreads as we move along
J
Josef Bacik 已提交
7810 7811 7812 7813
	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
	 * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
	 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
	 *			again
7814
	 */
7815
	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7816
		index = 0;
7817 7818 7819
		if (loop == LOOP_CACHING_NOWAIT) {
			/*
			 * We want to skip the LOOP_CACHING_WAIT step if we
7820
			 * don't have any uncached bgs and we've already done a
7821 7822
			 * full search through.
			 */
7823
			if (orig_have_caching_bg || !full_search)
7824 7825 7826 7827 7828 7829 7830
				loop = LOOP_CACHING_WAIT;
			else
				loop = LOOP_ALLOC_CHUNK;
		} else {
			loop++;
		}

J
Josef Bacik 已提交
7831
		if (loop == LOOP_ALLOC_CHUNK) {
7832
			struct btrfs_trans_handle *trans;
7833 7834 7835 7836 7837 7838 7839
			int exist = 0;

			trans = current->journal_info;
			if (trans)
				exist = 1;
			else
				trans = btrfs_join_transaction(root);
7840 7841 7842 7843 7844 7845

			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				goto out;
			}

7846
			ret = do_chunk_alloc(trans, root, flags,
7847
					     CHUNK_ALLOC_FORCE);
7848 7849 7850 7851 7852 7853 7854 7855 7856

			/*
			 * If we can't allocate a new chunk we've already looped
			 * through at least once, move on to the NO_EMPTY_SIZE
			 * case.
			 */
			if (ret == -ENOSPC)
				loop = LOOP_NO_EMPTY_SIZE;

7857 7858 7859 7860
			/*
			 * Do not bail out on ENOSPC since we
			 * can do more things.
			 */
7861
			if (ret < 0 && ret != -ENOSPC)
7862
				btrfs_abort_transaction(trans, ret);
7863 7864
			else
				ret = 0;
7865 7866
			if (!exist)
				btrfs_end_transaction(trans, root);
7867
			if (ret)
7868
				goto out;
J
Josef Bacik 已提交
7869 7870
		}

7871
		if (loop == LOOP_NO_EMPTY_SIZE) {
7872 7873 7874 7875 7876 7877 7878 7879 7880
			/*
			 * Don't loop again if we already have no empty_size and
			 * no empty_cluster.
			 */
			if (empty_size == 0 &&
			    empty_cluster == 0) {
				ret = -ENOSPC;
				goto out;
			}
7881 7882
			empty_size = 0;
			empty_cluster = 0;
7883
		}
7884 7885

		goto search;
J
Josef Bacik 已提交
7886 7887
	} else if (!ins->objectid) {
		ret = -ENOSPC;
7888
	} else if (ins->objectid) {
7889 7890 7891 7892 7893
		if (!use_cluster && last_ptr) {
			spin_lock(&last_ptr->lock);
			last_ptr->window_start = ins->objectid;
			spin_unlock(&last_ptr->lock);
		}
7894
		ret = 0;
C
Chris Mason 已提交
7895
	}
7896
out:
7897 7898 7899 7900
	if (ret == -ENOSPC) {
		spin_lock(&space_info->lock);
		space_info->max_extent_size = max_extent_size;
		spin_unlock(&space_info->lock);
7901
		ins->offset = max_extent_size;
7902
	}
C
Chris Mason 已提交
7903
	return ret;
7904
}

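/*
 * Debug helper used on ENOSPC: print the counters of a space_info and,
 * optionally, of every block group that belongs to it.
 */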
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
			    int dump_block_groups)
{
	struct btrfs_block_group_cache *cache;
	int index = 0;

	spin_lock(&info->lock);
	printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
	       info->flags,
	       info->total_bytes - info->bytes_used - info->bytes_pinned -
	       info->bytes_reserved - info->bytes_readonly -
	       info->bytes_may_use, (info->full) ? "" : "not ");
	printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
	       "reserved=%llu, may_use=%llu, readonly=%llu\n",
	       info->total_bytes, info->bytes_used, info->bytes_pinned,
	       info->bytes_reserved, info->bytes_may_use,
	       info->bytes_readonly);
	spin_unlock(&info->lock);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	list_for_each_entry(cache, &info->block_groups[index], list) {
		spin_lock(&cache->lock);
		printk(KERN_INFO "BTRFS: "
			   "block group %llu has %llu bytes, "
			   "%llu used %llu pinned %llu reserved %s\n",
		       cache->key.objectid, cache->key.offset,
		       btrfs_block_group_used(&cache->item), cache->pinned,
		       cache->reserved, cache->ro ? "[readonly]" : "");
		btrfs_dump_free_space(cache, bytes);
		spin_unlock(&cache->lock);
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);
}

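/*
 * Reserve an extent of at least min_alloc_size bytes.  On ENOSPC the request
 * is halved (never below min_alloc_size and bounded by the largest free
 * extent the failed search reported) and the search is retried.
 */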
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
			 u64 num_bytes, u64 min_alloc_size,
			 u64 empty_size, u64 hint_byte,
			 struct btrfs_key *ins, int is_data, int delalloc)
{
	bool final_tried = num_bytes == min_alloc_size;
	u64 flags;
	int ret;

	flags = btrfs_get_alloc_profile(root, is_data);
again:
	WARN_ON(num_bytes < root->sectorsize);
	ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
			       hint_byte, ins, flags, delalloc);
	if (!ret && !is_data) {
		btrfs_dec_block_group_reservations(root->fs_info,
						   ins->objectid);
	} else if (ret == -ENOSPC) {
		if (!final_tried && ins->offset) {
			num_bytes = min(num_bytes >> 1, ins->offset);
			num_bytes = round_down(num_bytes, root->sectorsize);
			num_bytes = max(num_bytes, min_alloc_size);
			ram_bytes = num_bytes;
			if (num_bytes == min_alloc_size)
				final_tried = true;
			goto again;
		} else if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
			struct btrfs_space_info *sinfo;

			sinfo = __find_space_info(root->fs_info, flags);
			btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
				flags, num_bytes);
			if (sinfo)
				dump_space_info(sinfo, num_bytes, 1);
		}
	}

	return ret;
}

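/*
 * Undo an extent reservation: either pin the range until the transaction
 * commits or hand it straight back to the block group's free space cache,
 * discarding it first if the discard mount option is enabled.
 */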
static int __btrfs_free_reserved_extent(struct btrfs_root *root,
					u64 start, u64 len,
					int pin, int delalloc)
{
	struct btrfs_block_group_cache *cache;
	int ret = 0;

	cache = btrfs_lookup_block_group(root->fs_info, start);
	if (!cache) {
		btrfs_err(root->fs_info, "Unable to find block group for %llu",
			start);
		return -ENOSPC;
	}

	if (pin)
		pin_down_extent(root, cache, start, len, 1);
	else {
		if (btrfs_test_opt(root->fs_info, DISCARD))
			ret = btrfs_discard_extent(root, start, len, NULL);
		btrfs_add_free_space(cache, start, len);
		btrfs_free_reserved_bytes(cache, len, delalloc);
		trace_btrfs_reserved_extent_free(root, start, len);
	}

	btrfs_put_block_group(cache);
	return ret;
}

int btrfs_free_reserved_extent(struct btrfs_root *root,
			       u64 start, u64 len, int delalloc)
{
	return __btrfs_free_reserved_extent(root, start, len, 0, delalloc);
}

int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
				       u64 start, u64 len)
{
	return __btrfs_free_reserved_extent(root, start, len, 1, 0);
}

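/*
 * Insert the extent item and its inline data backref for a newly allocated
 * data extent, then update the free space tree and block group accounting.
 */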
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod)
8031 8032
{
	int ret;
8033
	struct btrfs_fs_info *fs_info = root->fs_info;
8034
	struct btrfs_extent_item *extent_item;
8035
	struct btrfs_extent_inline_ref *iref;
8036
	struct btrfs_path *path;
8037 8038 8039
	struct extent_buffer *leaf;
	int type;
	u32 size;
8040

8041 8042 8043 8044
	if (parent > 0)
		type = BTRFS_SHARED_DATA_REF_KEY;
	else
		type = BTRFS_EXTENT_DATA_REF_KEY;
8045

8046
	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
8047 8048

	path = btrfs_alloc_path();
T
Tsutomu Itoh 已提交
8049 8050
	if (!path)
		return -ENOMEM;
8051

8052
	path->leave_spinning = 1;
8053 8054
	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      ins, size);
8055 8056 8057 8058
	if (ret) {
		btrfs_free_path(path);
		return ret;
	}
J
Josef Bacik 已提交
8059

8060 8061
	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
8062
				     struct btrfs_extent_item);
8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082
	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_DATA);

	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
	btrfs_set_extent_inline_ref_type(leaf, iref, type);
	if (parent > 0) {
		struct btrfs_shared_data_ref *ref;
		ref = (struct btrfs_shared_data_ref *)(iref + 1);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
	} else {
		struct btrfs_extent_data_ref *ref;
		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
	}
8083 8084

	btrfs_mark_buffer_dirty(path->nodes[0]);
8085
	btrfs_free_path(path);
8086

8087 8088 8089 8090 8091
	ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
					  ins->offset);
	if (ret)
		return ret;

8092
	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
8093
	if (ret) { /* -ENOENT, logic error */
8094
		btrfs_err(fs_info, "update block group failed for %llu %llu",
8095
			ins->objectid, ins->offset);
8096 8097
		BUG();
	}
J
Josef Bacik 已提交
8098
	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
8099 8100 8101
	return ret;
}

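/*
 * Counterpart of alloc_reserved_file_extent() for tree blocks: with the
 * skinny-metadata feature the btrfs_tree_block_info is omitted from the
 * extent item, keeping it smaller.
 */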
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
8106
				     int level, struct btrfs_key *ins)
8107 8108
{
	int ret;
8109 8110 8111 8112 8113 8114
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_extent_item *extent_item;
	struct btrfs_tree_block_info *block_info;
	struct btrfs_extent_inline_ref *iref;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
8115
	u32 size = sizeof(*extent_item) + sizeof(*iref);
J
Josef Bacik 已提交
8116
	u64 num_bytes = ins->offset;
8117 8118 8119 8120 8121
	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
						 SKINNY_METADATA);

	if (!skinny_metadata)
		size += sizeof(*block_info);
8122

8123
	path = btrfs_alloc_path();
8124 8125
	if (!path) {
		btrfs_free_and_pin_reserved_extent(root, ins->objectid,
8126
						   root->nodesize);
8127
		return -ENOMEM;
8128
	}
8129

8130 8131 8132
	path->leave_spinning = 1;
	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
				      ins, size);
8133
	if (ret) {
8134
		btrfs_free_path(path);
8135
		btrfs_free_and_pin_reserved_extent(root, ins->objectid,
8136
						   root->nodesize);
8137 8138
		return ret;
	}
8139 8140 8141 8142 8143 8144 8145 8146 8147

	leaf = path->nodes[0];
	extent_item = btrfs_item_ptr(leaf, path->slots[0],
				     struct btrfs_extent_item);
	btrfs_set_extent_refs(leaf, extent_item, 1);
	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
	btrfs_set_extent_flags(leaf, extent_item,
			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);

8148 8149
	if (skinny_metadata) {
		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
8150
		num_bytes = root->nodesize;
8151 8152 8153 8154 8155 8156
	} else {
		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
		btrfs_set_tree_block_key(leaf, block_info, key);
		btrfs_set_tree_block_level(leaf, block_info, level);
		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
	}
8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171

	if (parent > 0) {
		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_SHARED_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
	} else {
		btrfs_set_extent_inline_ref_type(leaf, iref,
						 BTRFS_TREE_BLOCK_REF_KEY);
		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
	}

	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

8172 8173 8174 8175 8176
	ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
					  num_bytes);
	if (ret)
		return ret;

8177 8178
	ret = update_block_group(trans, root, ins->objectid, root->nodesize,
				 1);
8179
	if (ret) { /* -ENOENT, logic error */
8180
		btrfs_err(fs_info, "update block group failed for %llu %llu",
8181
			ins->objectid, ins->offset);
8182 8183
		BUG();
	}
J
Josef Bacik 已提交
8184

8185
	trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize);
8186 8187 8188 8189 8190 8191
	return ret;
}

int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     u64 root_objectid, u64 owner,
				     u64 offset, u64 ram_bytes,
				     struct btrfs_key *ins)
{
	int ret;

	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);

	ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
					 ins->offset, 0,
					 root_objectid, owner, offset,
					 ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
					 NULL);
	return ret;
}

/*
 * this is used by the tree logging recovery code.  It records that
 * an extent has been allocated and makes sure to clear the free
 * space cache bits as well
 */
8212 8213 8214 8215
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   u64 root_objectid, u64 owner, u64 offset,
				   struct btrfs_key *ins)
8216 8217 8218
{
	int ret;
	struct btrfs_block_group_cache *block_group;
8219
	struct btrfs_space_info *space_info;
8220

8221 8222
	/*
	 * Mixed block groups will exclude before processing the log so we only
8223
	 * need to do the exclude dance if this fs isn't mixed.
8224 8225 8226
	 */
	if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
		ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
8227
		if (ret)
8228
			return ret;
8229 8230
	}

8231 8232 8233 8234
	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
	if (!block_group)
		return -EINVAL;

8235 8236 8237 8238 8239 8240 8241 8242
	space_info = block_group->space_info;
	spin_lock(&space_info->lock);
	spin_lock(&block_group->lock);
	space_info->bytes_reserved += ins->offset;
	block_group->reserved += ins->offset;
	spin_unlock(&block_group->lock);
	spin_unlock(&space_info->lock);

8243 8244
	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
					 0, owner, offset, ins, 1);
8245
	btrfs_put_block_group(block_group);
8246 8247 8248
	return ret;
}

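/*
 * Set up a freshly allocated tree block: assign the transaction generation
 * and lockdep class, clear stale bits and mark the buffer dirty in the right
 * io tree (log trees track their dirty pages separately so two log
 * transactions can run at a time).
 */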
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
8251
		      u64 bytenr, int level)
8252 8253 8254
{
	struct extent_buffer *buf;

8255
	buf = btrfs_find_create_tree_block(root, bytenr);
8256 8257 8258
	if (IS_ERR(buf))
		return buf;

8259
	btrfs_set_header_generation(buf, trans->transid);
8260
	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
8261
	btrfs_tree_lock(buf);
8262
	clean_tree_block(trans, root->fs_info, buf);
8263
	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
8264 8265

	btrfs_set_lock_blocking(buf);
8266
	set_extent_buffer_uptodate(buf);
8267

8268
	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
8269
		buf->log_index = root->log_transid % 2;
8270 8271 8272 8273
		/*
		 * we allow two log transactions at a time, use different
		 * EXTENT bit to differentiate dirty pages.
		 */
8274
		if (buf->log_index == 0)
8275 8276 8277 8278
			set_extent_dirty(&root->dirty_log_pages, buf->start,
					buf->start + buf->len - 1, GFP_NOFS);
		else
			set_extent_new(&root->dirty_log_pages, buf->start,
8279
					buf->start + buf->len - 1);
8280
	} else {
8281
		buf->log_index = -1;
8282
		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
8283
			 buf->start + buf->len - 1, GFP_NOFS);
8284
	}
8285
	trans->dirty = true;
8286
	/* this returns a buffer locked for blocking */
8287 8288 8289
	return buf;
}

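/*
 * Pick the block reserve to charge a new tree block to.  If the root's own
 * reserve is exhausted, try a fresh metadata reservation and finally fall
 * back to the global reserve when it shares the same space_info.
 */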
static struct btrfs_block_rsv *
use_block_rsv(struct btrfs_trans_handle *trans,
	      struct btrfs_root *root, u32 blocksize)
{
	struct btrfs_block_rsv *block_rsv;
8295
	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
8296
	int ret;
8297
	bool global_updated = false;
8298 8299 8300

	block_rsv = get_block_rsv(trans, root);

8301 8302
	if (unlikely(block_rsv->size == 0))
		goto try_reserve;
8303
again:
8304 8305 8306 8307
	ret = block_rsv_use_bytes(block_rsv, blocksize);
	if (!ret)
		return block_rsv;

8308 8309 8310
	if (block_rsv->failfast)
		return ERR_PTR(ret);

8311 8312 8313 8314 8315 8316
	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
		global_updated = true;
		update_global_block_rsv(root->fs_info);
		goto again;
	}

8317
	if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
8318 8319 8320 8321 8322
		static DEFINE_RATELIMIT_STATE(_rs,
				DEFAULT_RATELIMIT_INTERVAL * 10,
				/*DEFAULT_RATELIMIT_BURST*/ 1);
		if (__ratelimit(&_rs))
			WARN(1, KERN_DEBUG
8323
				"BTRFS: block rsv returned %d\n", ret);
8324 8325 8326 8327 8328 8329 8330 8331
	}
try_reserve:
	ret = reserve_metadata_bytes(root, block_rsv, blocksize,
				     BTRFS_RESERVE_NO_FLUSH);
	if (!ret)
		return block_rsv;
	/*
	 * If we couldn't reserve metadata bytes try and use some from
8332 8333
	 * the global reserve if its space type is the same as the global
	 * reservation.
8334
	 */
8335 8336
	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
	    block_rsv->space_info == global_rsv->space_info) {
8337 8338 8339 8340 8341
		ret = block_rsv_use_bytes(global_rsv, blocksize);
		if (!ret)
			return global_rsv;
	}
	return ERR_PTR(ret);
8342 8343
}

static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
{
	block_rsv_add_bytes(block_rsv, blocksize, 0);
	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
}

8351
/*
8352
 * finds a free extent and does all the dirty work required for allocation
8353
 * returns the tree buffer or an ERR_PTR on error.
8354
 */
8355 8356
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
8357 8358
					u64 parent, u64 root_objectid,
					struct btrfs_disk_key *key, int level,
8359
					u64 hint, u64 empty_size)
8360
{
C
Chris Mason 已提交
8361
	struct btrfs_key ins;
8362
	struct btrfs_block_rsv *block_rsv;
8363
	struct extent_buffer *buf;
8364
	struct btrfs_delayed_extent_op *extent_op;
8365 8366
	u64 flags = 0;
	int ret;
8367
	u32 blocksize = root->nodesize;
8368 8369
	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
						 SKINNY_METADATA);
8370

8371
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8372
	if (btrfs_is_testing(root->fs_info)) {
8373
		buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
8374
					    level);
8375 8376 8377 8378
		if (!IS_ERR(buf))
			root->alloc_bytenr += blocksize;
		return buf;
	}
8379
#endif
8380

8381 8382 8383 8384
	block_rsv = use_block_rsv(trans, root, blocksize);
	if (IS_ERR(block_rsv))
		return ERR_CAST(block_rsv);

	ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
				   empty_size, hint, &ins, 0, 0);
	if (ret)
		goto out_unuse;

	buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_free_reserved;
	}

	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent == 0)
			parent = ins.objectid;
		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
	} else
		BUG_ON(parent > 0);

	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
		extent_op = btrfs_alloc_delayed_extent_op();
		if (!extent_op) {
			ret = -ENOMEM;
			goto out_free_buf;
		}
		if (key)
			memcpy(&extent_op->key, key, sizeof(extent_op->key));
		else
			memset(&extent_op->key, 0, sizeof(extent_op->key));
		extent_op->flags_to_set = flags;
		extent_op->update_key = skinny_metadata ? false : true;
		extent_op->update_flags = true;
		extent_op->is_data = false;
		extent_op->level = level;

		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
						 ins.objectid, ins.offset,
						 parent, root_objectid, level,
						 BTRFS_ADD_DELAYED_EXTENT,
						 extent_op);
		if (ret)
			goto out_free_delayed;
	}
	return buf;

out_free_delayed:
	btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
	free_extent_buffer(buf);
out_free_reserved:
	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0);
out_unuse:
	unuse_block_rsv(root->fs_info, block_rsv, blocksize);
	return ERR_PTR(ret);
}

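/*
 * state shared by the snapshot/subtree dropping walkers below; refs[]
 * and flags[] cache the extent refcount and backref flags for each
 * level currently held in the path.
 */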
struct walk_control {
	u64 refs[BTRFS_MAX_LEVEL];
	u64 flags[BTRFS_MAX_LEVEL];
	struct btrfs_key update_progress;
	int stage;
	int level;
	int shared_level;
	int update_ref;
	int keep_locks;
	int reada_slot;
	int reada_count;
	int for_reloc;
};

#define DROP_REFERENCE	1
#define UPDATE_BACKREF	2

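/*
 * issue readahead for the children of path->nodes[wc->level] that the
 * walk is likely to visit next, scaling wc->reada_count up or down
 * depending on how far the previous batch got.
 */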
static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct walk_control *wc,
				     struct btrfs_path *path)
{
	u64 bytenr;
	u64 generation;
	u64 refs;
	u64 flags;
	u32 nritems;
	u32 blocksize;
	struct btrfs_key key;
	struct extent_buffer *eb;
	int ret;
	int slot;
	int nread = 0;

	if (path->slots[wc->level] < wc->reada_slot) {
		wc->reada_count = wc->reada_count * 2 / 3;
		wc->reada_count = max(wc->reada_count, 2);
	} else {
		wc->reada_count = wc->reada_count * 3 / 2;
		wc->reada_count = min_t(int, wc->reada_count,
					BTRFS_NODEPTRS_PER_BLOCK(root));
	}

	eb = path->nodes[wc->level];
	nritems = btrfs_header_nritems(eb);
	blocksize = root->nodesize;

	for (slot = path->slots[wc->level]; slot < nritems; slot++) {
		if (nread >= wc->reada_count)
			break;

		cond_resched();
		bytenr = btrfs_node_blockptr(eb, slot);
		generation = btrfs_node_ptr_generation(eb, slot);

		if (slot == path->slots[wc->level])
			goto reada;

		if (wc->stage == UPDATE_BACKREF &&
		    generation <= root->root_key.offset)
			continue;

		/* We don't lock the tree block, it's OK to be racy here */
		ret = btrfs_lookup_extent_info(trans, root, bytenr,
					       wc->level - 1, 1, &refs,
					       &flags);
		/* We don't care about errors in readahead. */
		if (ret < 0)
			continue;
		BUG_ON(refs == 0);

		if (wc->stage == DROP_REFERENCE) {
			if (refs == 1)
				goto reada;

			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				continue;
			btrfs_node_key_to_cpu(eb, &key, slot);
			ret = btrfs_comp_cpu_keys(&key,
						  &wc->update_progress);
			if (ret < 0)
				continue;
		} else {
			if (wc->level == 1 &&
			    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				continue;
		}
reada:
		readahead_tree_block(root, bytenr);
		nread++;
	}
	wc->reada_slot = slot;
}

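/*
 * record a qgroup dirty extent for every on-disk file extent in the
 * given leaf so that quota accounting can be updated for them.
 */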
static int account_leaf_items(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *eb)
{
	int nr = btrfs_header_nritems(eb);
	int i, extent_type, ret;
	struct btrfs_key key;
	struct btrfs_file_extent_item *fi;
	u64 bytenr, num_bytes;

	/* We can be called directly from walk_up_proc() */
	if (!root->fs_info->quota_enabled)
		return 0;

	for (i = 0; i < nr; i++) {
		btrfs_item_key_to_cpu(eb, &key, i);

		if (key.type != BTRFS_EXTENT_DATA_KEY)
			continue;

		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
		/* filter out non qgroup-accountable extents  */
		extent_type = btrfs_file_extent_type(eb, fi);

		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
			continue;

		bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
		if (!bytenr)
			continue;

		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);

		ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info,
				bytenr, num_bytes, GFP_NOFS);
		if (ret)
			return ret;
	}
	return 0;
}

/*
 * Walk up the tree from the bottom, freeing leaves and any interior
 * nodes which have had all slots visited. If a node (leaf or
 * interior) is freed, the node above it will have its slot
 * incremented. The root node will never be freed.
 *
 * At the end of this function, we should have a path which has all
 * slots incremented to the next position for a search. If we need to
 * read a new node it will be NULL and the node above it will have the
 * correct slot selected for a later read.
 *
 * If we increment the root nodes slot counter past the number of
 * elements, 1 is returned to signal completion of the search.
 */
static int adjust_slots_upwards(struct btrfs_root *root,
				struct btrfs_path *path, int root_level)
{
	int level = 0;
	int nr, slot;
	struct extent_buffer *eb;

	if (root_level == 0)
		return 1;

	while (level <= root_level) {
		eb = path->nodes[level];
		nr = btrfs_header_nritems(eb);
		path->slots[level]++;
		slot = path->slots[level];
		if (slot >= nr || level == 0) {
			/*
			 * Don't free the root -  we will detect this
			 * condition after our loop and return a
			 * positive value for caller to stop walking the tree.
			 */
			if (level != root_level) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;

				free_extent_buffer(eb);
				path->nodes[level] = NULL;
				path->slots[level] = 0;
			}
		} else {
			/*
			 * We have a valid slot to walk back down
			 * from. Stop here so caller can process these
			 * new nodes.
			 */
			break;
		}

		level++;
	}

	eb = path->nodes[root_level];
	if (path->slots[root_level] >= btrfs_header_nritems(eb))
		return 1;

	return 0;
}

/*
 * root_eb is the subtree root and is locked before this function is called.
 */
static int account_shared_subtree(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct extent_buffer *root_eb,
				  u64 root_gen,
				  int root_level)
{
	int ret = 0;
	int level;
	struct extent_buffer *eb = root_eb;
	struct btrfs_path *path = NULL;

	BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL);
	BUG_ON(root_eb == NULL);

	if (!root->fs_info->quota_enabled)
		return 0;

	if (!extent_buffer_uptodate(root_eb)) {
		ret = btrfs_read_buffer(root_eb, root_gen);
		if (ret)
			goto out;
	}

	if (root_level == 0) {
		ret = account_leaf_items(trans, root, root_eb);
		goto out;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * Walk down the tree.  Missing extent blocks are filled in as
	 * we go. Metadata is accounted every time we read a new
	 * extent block.
	 *
	 * When we reach a leaf, we account for file extent items in it,
	 * walk back up the tree (adjusting slot pointers as we go)
	 * and restart the search process.
	 */
	extent_buffer_get(root_eb); /* For path */
	path->nodes[root_level] = root_eb;
	path->slots[root_level] = 0;
	path->locks[root_level] = 0; /* so release_path doesn't try to unlock */
walk_down:
	level = root_level;
	while (level >= 0) {
		if (path->nodes[level] == NULL) {
			int parent_slot;
			u64 child_gen;
			u64 child_bytenr;

			/* We need to get child blockptr/gen from
			 * parent before we can read it. */
			eb = path->nodes[level + 1];
			parent_slot = path->slots[level + 1];
			child_bytenr = btrfs_node_blockptr(eb, parent_slot);
			child_gen = btrfs_node_ptr_generation(eb, parent_slot);

			eb = read_tree_block(root, child_bytenr, child_gen);
			if (IS_ERR(eb)) {
				ret = PTR_ERR(eb);
				goto out;
			} else if (!extent_buffer_uptodate(eb)) {
				free_extent_buffer(eb);
				ret = -EIO;
				goto out;
			}

			path->nodes[level] = eb;
			path->slots[level] = 0;

			btrfs_tree_read_lock(eb);
			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
			path->locks[level] = BTRFS_READ_LOCK_BLOCKING;

			ret = btrfs_qgroup_insert_dirty_extent(trans,
					root->fs_info, child_bytenr,
					root->nodesize, GFP_NOFS);
			if (ret)
				goto out;
		}

		if (level == 0) {
			ret = account_leaf_items(trans, root, path->nodes[level]);
			if (ret)
				goto out;

			/* Nonzero return here means we completed our search */
			ret = adjust_slots_upwards(root, path, root_level);
			if (ret)
				break;

			/* Restart search with new slots */
			goto walk_down;
		}

		level--;
	}

	ret = 0;
out:
	btrfs_free_path(path);

	return ret;
}

/*
 * helper to process tree block while walking down the tree.
 *
 * when wc->stage == UPDATE_BACKREF, this function updates
 * back refs for pointers in the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
8760
static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
8761
				   struct btrfs_root *root,
8762
				   struct btrfs_path *path,
8763
				   struct walk_control *wc, int lookup_info)
{
8765 8766 8767
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	int ret;

8770 8771 8772
	if (wc->stage == UPDATE_BACKREF &&
	    btrfs_header_owner(eb) != root->root_key.objectid)
		return 1;

8774 8775 8776 8777
	/*
	 * when reference count of tree block is 1, it won't increase
	 * again. once full backref flag is set, we never clear it.
	 */
8778 8779 8780
	if (lookup_info &&
	    ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
8781 8782
		BUG_ON(!path->locks[level]);
		ret = btrfs_lookup_extent_info(trans, root,
8783
					       eb->start, level, 1,
8784 8785
					       &wc->refs[level],
					       &wc->flags[level]);
8786 8787 8788
		BUG_ON(ret == -ENOMEM);
		if (ret)
			return ret;
8789 8790
		BUG_ON(wc->refs[level] == 0);
	}
8791

8792 8793 8794
	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level] > 1)
			return 1;

8796
		if (path->locks[level] && !wc->keep_locks) {
8797
			btrfs_tree_unlock_rw(eb, path->locks[level]);
8798 8799 8800 8801
			path->locks[level] = 0;
		}
		return 0;
	}

8803 8804 8805
	/* wc->stage == UPDATE_BACKREF */
	if (!(wc->flags[level] & flag)) {
		BUG_ON(!path->locks[level]);
8806
		ret = btrfs_inc_ref(trans, root, eb, 1);
8807
		BUG_ON(ret); /* -ENOMEM */
8808
		ret = btrfs_dec_ref(trans, root, eb, 0);
8809
		BUG_ON(ret); /* -ENOMEM */
8810
		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
8811 8812
						  eb->len, flag,
						  btrfs_header_level(eb), 0);
8813
		BUG_ON(ret); /* -ENOMEM */
8814 8815 8816 8817 8818 8819 8820 8821
		wc->flags[level] |= flag;
	}

	/*
	 * the block is shared by multiple trees, so it's not good to
	 * keep the tree lock
	 */
	if (path->locks[level] && level > 0) {
8822
		btrfs_tree_unlock_rw(eb, path->locks[level]);
8823 8824 8825 8826 8827
		path->locks[level] = 0;
	}
	return 0;
}

/*
 * helper to process tree block pointer.
 *
 * when wc->stage == DROP_REFERENCE, this function checks
 * reference count of the block pointed to. if the block
 * is shared and we need update back refs for the subtree
 * rooted at the block, this function changes wc->stage to
 * UPDATE_BACKREF. if the block is shared and there is no
 * need to update back, this function drops the reference
 * to the block.
 *
 * NOTE: return value 1 means we should stop walking down.
 */
static noinline int do_walk_down(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
8844
				 struct walk_control *wc, int *lookup_info)
{
	u64 bytenr;
	u64 generation;
	u64 parent;
	u32 blocksize;
	struct btrfs_key key;
	struct extent_buffer *next;
	int level = wc->level;
	int reada = 0;
	int ret = 0;
8855
	bool need_account = false;

	generation = btrfs_node_ptr_generation(path->nodes[level],
					       path->slots[level]);
	/*
	 * if the lower level block was created before the snapshot
	 * was created, we know there is no need to update back refs
	 * for the subtree
	 */
	if (wc->stage == UPDATE_BACKREF &&
8865 8866
	    generation <= root->root_key.offset) {
		*lookup_info = 1;
		return 1;
8868
	}

	bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
8871
	blocksize = root->nodesize;

8873
	next = btrfs_find_tree_block(root->fs_info, bytenr);
	if (!next) {
8875
		next = btrfs_find_create_tree_block(root, bytenr);
8876 8877 8878
		if (IS_ERR(next))
			return PTR_ERR(next);

8879 8880
		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
					       level - 1);
		reada = 1;
	}
	btrfs_tree_lock(next);
	btrfs_set_lock_blocking(next);

8886
	ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
8887 8888
				       &wc->refs[level - 1],
				       &wc->flags[level - 1]);
8889 8890 8891 8892 8893
	if (ret < 0) {
		btrfs_tree_unlock(next);
		return ret;
	}

8894 8895 8896 8897
	if (unlikely(wc->refs[level - 1] == 0)) {
		btrfs_err(root->fs_info, "Missing references.");
		BUG();
	}
8898
	*lookup_info = 0;

8900
	if (wc->stage == DROP_REFERENCE) {
		if (wc->refs[level - 1] > 1) {
8902
			need_account = true;
8903 8904 8905 8906
			if (level == 1 &&
			    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
				goto skip;

			if (!wc->update_ref ||
			    generation <= root->root_key.offset)
				goto skip;

			btrfs_node_key_to_cpu(path->nodes[level], &key,
					      path->slots[level]);
			ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
			if (ret < 0)
				goto skip;

			wc->stage = UPDATE_BACKREF;
			wc->shared_level = level - 1;
		}
8920 8921 8922 8923
	} else {
		if (level == 1 &&
		    (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
			goto skip;
	}

8926
	if (!btrfs_buffer_uptodate(next, generation, 0)) {
		btrfs_tree_unlock(next);
		free_extent_buffer(next);
		next = NULL;
8930
		*lookup_info = 1;
	}

	if (!next) {
		if (reada && level == 1)
			reada_walk_down(trans, root, wc, path);
8936
		next = read_tree_block(root, bytenr, generation);
8937 8938 8939
		if (IS_ERR(next)) {
			return PTR_ERR(next);
		} else if (!extent_buffer_uptodate(next)) {
8940
			free_extent_buffer(next);
8941
			return -EIO;
8942
		}
		btrfs_tree_lock(next);
		btrfs_set_lock_blocking(next);
	}

	level--;
	BUG_ON(level != btrfs_header_level(next));
	path->nodes[level] = next;
	path->slots[level] = 0;
8951
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	wc->level = level;
	if (wc->level == 1)
		wc->reada_slot = 0;
	return 0;
skip:
	wc->refs[level - 1] = 0;
	wc->flags[level - 1] = 0;
	if (wc->stage == DROP_REFERENCE) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			parent = path->nodes[level]->start;
		} else {
			BUG_ON(root->root_key.objectid !=
			       btrfs_header_owner(path->nodes[level]));
			parent = 0;
		}

8968 8969 8970 8971
		if (need_account) {
			ret = account_shared_subtree(trans, root, next,
						     generation, level - 1);
			if (ret) {
8972 8973
				btrfs_err_rl(root->fs_info,
					"Error "
8974
					"%d accounting shared subtree. Quota "
8975 8976
					"is out of sync, rescan required.",
					ret);
8977 8978
			}
		}
8979
		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
8980
				root->root_key.objectid, level - 1, 0);
8981
		BUG_ON(ret); /* -ENOMEM */
	}
	btrfs_tree_unlock(next);
	free_extent_buffer(next);
8985
	*lookup_info = 1;
	return 1;
}

8989
/*
 * helper to process tree block while walking up the tree.
8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005
 *
 * when wc->stage == DROP_REFERENCE, this function drops
 * reference count on the block.
 *
 * when wc->stage == UPDATE_BACKREF, this function changes
 * wc->stage back to DROP_REFERENCE if we changed wc->stage
 * to UPDATE_BACKREF previously while processing the block.
 *
 * NOTE: return value 1 means we should stop walking up.
 */
static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root,
				 struct btrfs_path *path,
				 struct walk_control *wc)
{
9006
	int ret;
	int level = wc->level;
	struct extent_buffer *eb = path->nodes[level];
	u64 parent = 0;

	if (wc->stage == UPDATE_BACKREF) {
		BUG_ON(wc->shared_level < level);
		if (level < wc->shared_level)
			goto out;

		ret = find_next_key(path, level + 1, &wc->update_progress);
		if (ret > 0)
			wc->update_ref = 0;

		wc->stage = DROP_REFERENCE;
		wc->shared_level = -1;
		path->slots[level] = 0;

		/*
		 * check reference count again if the block isn't locked.
		 * we should start walking down the tree again if reference
		 * count is one.
		 */
		if (!path->locks[level]) {
			BUG_ON(level == 0);
			btrfs_tree_lock(eb);
			btrfs_set_lock_blocking(eb);
9033
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9034 9035

			ret = btrfs_lookup_extent_info(trans, root,
9036
						       eb->start, level, 1,
9037 9038
						       &wc->refs[level],
						       &wc->flags[level]);
9039 9040
			if (ret < 0) {
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
9042 9043
				return ret;
			}
9044 9045
			BUG_ON(wc->refs[level] == 0);
			if (wc->refs[level] == 1) {
9046
				btrfs_tree_unlock_rw(eb, path->locks[level]);
				path->locks[level] = 0;
9048 9049
				return 1;
			}
		}
9051
	}

9053 9054
	/* wc->stage == DROP_REFERENCE */
	BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
9055

9056 9057 9058
	if (wc->refs[level] == 1) {
		if (level == 0) {
			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
9059
				ret = btrfs_dec_ref(trans, root, eb, 1);
9060
			else
9061
				ret = btrfs_dec_ref(trans, root, eb, 0);
9062
			BUG_ON(ret); /* -ENOMEM */
9063 9064
			ret = account_leaf_items(trans, root, eb);
			if (ret) {
9065 9066
				btrfs_err_rl(root->fs_info,
					"error "
9067
					"%d accounting leaf items. Quota "
9068 9069
					"is out of sync, rescan required.",
					ret);
9070
			}
9071 9072 9073 9074 9075 9076
		}
		/* make block locked assertion in clean_tree_block happy */
		if (!path->locks[level] &&
		    btrfs_header_generation(eb) == trans->transid) {
			btrfs_tree_lock(eb);
			btrfs_set_lock_blocking(eb);
9077
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9078
		}
9079
		clean_tree_block(trans, root->fs_info, eb);
	}

	if (eb == root->node) {
		if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = eb->start;
		else
			BUG_ON(root->root_key.objectid !=
			       btrfs_header_owner(eb));
	} else {
		if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
			parent = path->nodes[level + 1]->start;
		else
			BUG_ON(root->root_key.objectid !=
			       btrfs_header_owner(path->nodes[level + 1]));
	}

9096
	btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
9097 9098 9099
out:
	wc->refs[level] = 0;
	wc->flags[level] = 0;
9100
	return 0;
}

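/*
 * descend from wc->level towards the leaves, running walk_down_proc()
 * on the current block and do_walk_down() on each child, until a block
 * is reached that should not be entered.
 */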
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct btrfs_path *path,
				   struct walk_control *wc)
{
	int level = wc->level;
9109
	int lookup_info = 1;
9110 9111 9112
	int ret;

	while (level >= 0) {
9113
		ret = walk_down_proc(trans, root, path, wc, lookup_info);
9114 9115 9116 9117 9118 9119
		if (ret > 0)
			break;

		if (level == 0)
			break;

9120 9121 9122 9123
		if (path->slots[level] >=
		    btrfs_header_nritems(path->nodes[level]))
			break;

9124
		ret = do_walk_down(trans, root, path, wc, &lookup_info);
		if (ret > 0) {
			path->slots[level]++;
			continue;
9128 9129
		} else if (ret < 0)
			return ret;
		level = wc->level;
	}
	return 0;
}

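/*
 * climb back up the tree, finishing blocks via walk_up_proc(), until a
 * node with unvisited slots is found or max_level is reached.
 */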
static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
9136
				 struct btrfs_root *root,
				 struct btrfs_path *path,
9138
				 struct walk_control *wc, int max_level)
{
9140
	int level = wc->level;
	int ret;
9142

9143 9144 9145 9146 9147 9148
	path->slots[level] = btrfs_header_nritems(path->nodes[level]);
	while (level < max_level && path->nodes[level]) {
		wc->level = level;
		if (path->slots[level] + 1 <
		    btrfs_header_nritems(path->nodes[level])) {
			path->slots[level]++;
			return 0;
		} else {
9151 9152 9153
			ret = walk_up_proc(trans, root, path, wc);
			if (ret > 0)
				return 0;
9154

9155
			if (path->locks[level]) {
9156 9157
				btrfs_tree_unlock_rw(path->nodes[level],
						     path->locks[level]);
9158
				path->locks[level] = 0;
			}
9160 9161 9162
			free_extent_buffer(path->nodes[level]);
			path->nodes[level] = NULL;
			level++;
		}
	}
	return 1;
}

/*
9169 9170 9171 9172 9173 9174 9175 9176 9177
 * drop a subvolume tree.
 *
 * this function traverses the tree freeing any blocks that only
 * referenced by the tree.
 *
 * when a shared tree block is found. this function decreases its
 * reference count by one. if update_ref is true, this function
 * also make sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 *
 * If called with for_reloc == 0, may exit early with -EAGAIN
 */
9181
int btrfs_drop_snapshot(struct btrfs_root *root,
			 struct btrfs_block_rsv *block_rsv, int update_ref,
			 int for_reloc)
{
9185
	struct btrfs_path *path;
9186 9187
	struct btrfs_trans_handle *trans;
	struct btrfs_root *tree_root = root->fs_info->tree_root;
9188
	struct btrfs_root_item *root_item = &root->root_item;
9189 9190 9191 9192 9193
	struct walk_control *wc;
	struct btrfs_key key;
	int err = 0;
	int ret;
	int level;
9194
	bool root_dropped = false;

9196 9197
	btrfs_debug(root->fs_info, "Drop subvolume %llu", root->objectid);

9198
	path = btrfs_alloc_path();
9199 9200 9201 9202
	if (!path) {
		err = -ENOMEM;
		goto out;
	}

9204
	wc = kzalloc(sizeof(*wc), GFP_NOFS);
9205 9206
	if (!wc) {
		btrfs_free_path(path);
9207 9208
		err = -ENOMEM;
		goto out;
9209
	}
9210

9211
	trans = btrfs_start_transaction(tree_root, 0);
9212 9213 9214 9215
	if (IS_ERR(trans)) {
		err = PTR_ERR(trans);
		goto out_free;
	}
9216

9217 9218
	if (block_rsv)
		trans->block_rsv = block_rsv;
9219

9220
	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
9221
		level = btrfs_header_level(root->node);
9222 9223
		path->nodes[level] = btrfs_lock_root_node(root);
		btrfs_set_lock_blocking(path->nodes[level]);
9224
		path->slots[level] = 0;
9225
		path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9226 9227
		memset(&wc->update_progress, 0,
		       sizeof(wc->update_progress));
9228 9229
	} else {
		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
9230 9231 9232
		memcpy(&wc->update_progress, &key,
		       sizeof(wc->update_progress));

9233
		level = root_item->drop_level;
9234
		BUG_ON(level == 0);
9235
		path->lowest_level = level;
9236 9237 9238 9239
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		path->lowest_level = 0;
		if (ret < 0) {
			err = ret;
9240
			goto out_end_trans;
9241
		}
		WARN_ON(ret > 0);
9243

9244 9245 9246 9247
		/*
		 * unlock our path, this is safe because only this
		 * function is allowed to delete this snapshot
		 */
9248
		btrfs_unlock_up_safe(path, 0);
9249 9250 9251 9252 9253

		level = btrfs_header_level(root->node);
		while (1) {
			btrfs_tree_lock(path->nodes[level]);
			btrfs_set_lock_blocking(path->nodes[level]);
9254
			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9255 9256 9257

			ret = btrfs_lookup_extent_info(trans, root,
						path->nodes[level]->start,
9258
						level, 1, &wc->refs[level],
9259
						&wc->flags[level]);
9260 9261 9262 9263
			if (ret < 0) {
				err = ret;
				goto out_end_trans;
			}
9264 9265 9266 9267 9268 9269
			BUG_ON(wc->refs[level] == 0);

			if (level == root_item->drop_level)
				break;

			btrfs_tree_unlock(path->nodes[level]);
9270
			path->locks[level] = 0;
9271 9272 9273
			WARN_ON(wc->refs[level] != 1);
			level--;
		}
9274
	}
9275 9276 9277 9278 9279 9280

	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = update_ref;
	wc->keep_locks = 0;
	wc->for_reloc = for_reloc;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
9283

	while (1) {

9286 9287 9288
		ret = walk_down_tree(trans, root, path, wc);
		if (ret < 0) {
			err = ret;
			break;
9290
		}

9292 9293 9294
		ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
		if (ret < 0) {
			err = ret;
			break;
9296 9297 9298 9299
		}

		if (ret > 0) {
			BUG_ON(wc->stage != DROP_REFERENCE);
9300 9301
			break;
		}

		if (wc->stage == DROP_REFERENCE) {
			level = wc->level;
			btrfs_node_key(path->nodes[level],
				       &root_item->drop_progress,
				       path->slots[level]);
			root_item->drop_level = level;
		}

		BUG_ON(wc->level == 0);
9312 9313
		if (btrfs_should_end_transaction(trans, tree_root) ||
		    (!for_reloc && btrfs_need_cleaner_sleep(root))) {
9314 9315 9316
			ret = btrfs_update_root(trans, tree_root,
						&root->root_key,
						root_item);
9317
			if (ret) {
9318
				btrfs_abort_transaction(trans, ret);
9319 9320 9321
				err = ret;
				goto out_end_trans;
			}
9322

9323
			btrfs_end_transaction_throttle(trans, tree_root);
9324
			if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
9325
				pr_debug("BTRFS: drop snapshot early exit\n");
9326 9327 9328 9329
				err = -EAGAIN;
				goto out_free;
			}

9330
			trans = btrfs_start_transaction(tree_root, 0);
9331 9332 9333 9334
			if (IS_ERR(trans)) {
				err = PTR_ERR(trans);
				goto out_free;
			}
9335 9336
			if (block_rsv)
				trans->block_rsv = block_rsv;
9337
		}
	}
9339
	btrfs_release_path(path);
9340 9341
	if (err)
		goto out_end_trans;
9342 9343

	ret = btrfs_del_root(trans, tree_root, &root->root_key);
9344
	if (ret) {
9345
		btrfs_abort_transaction(trans, ret);
9346 9347
		goto out_end_trans;
	}
9348

9349
	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
9350 9351
		ret = btrfs_find_root(tree_root, &root->root_key, path,
				      NULL, NULL);
9352
		if (ret < 0) {
9353
			btrfs_abort_transaction(trans, ret);
9354 9355 9356
			err = ret;
			goto out_end_trans;
		} else if (ret > 0) {
			/* if we fail to delete the orphan item this time
			 * around, it'll get picked up the next time.
			 *
			 * The most common failure here is just -ENOENT.
			 */
			btrfs_del_orphan_item(trans, tree_root,
					      root->root_key.objectid);
9364 9365 9366
		}
	}

9367
	if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
9368
		btrfs_add_dropped_root(trans, root);
9369 9370 9371
	} else {
		free_extent_buffer(root->node);
		free_extent_buffer(root->commit_root);
9372
		btrfs_put_fs_root(root);
9373
	}
9374
	root_dropped = true;
9375
out_end_trans:
9376
	btrfs_end_transaction_throttle(trans, tree_root);
9377
out_free:
9378
	kfree(wc);
9379
	btrfs_free_path(path);
9380
out:
	/*
	 * So if we need to stop dropping the snapshot for whatever reason we
	 * need to make sure to add it back to the dead root list so that we
	 * keep trying to do the work later.  This also cleans up roots if we
	 * don't have it in the radix (like when we recover after a power fail
	 * or unmount) so we don't leak memory.
	 */
9388
	if (!for_reloc && root_dropped == false)
9389
		btrfs_add_dead_root(root);
9390
	if (err && err != -EAGAIN)
9391
		btrfs_handle_fs_error(root->fs_info, err, NULL);
9392
	return err;
}

9395 9396 9397 9398
/*
 * drop subtree rooted at tree block 'node'.
 *
 * NOTE: this function will unlock and release tree block 'node'
 * only used by relocation code
9400
 */
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct extent_buffer *node,
			struct extent_buffer *parent)
{
	struct btrfs_path *path;
9407
	struct walk_control *wc;
	int level;
	int parent_level;
	int ret = 0;
	int wret;

9413 9414
	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

9419
	wc = kzalloc(sizeof(*wc), GFP_NOFS);
	if (!wc) {
		btrfs_free_path(path);
		return -ENOMEM;
	}
9424

9425
	btrfs_assert_tree_locked(parent);
	parent_level = btrfs_header_level(parent);
	extent_buffer_get(parent);
	path->nodes[parent_level] = parent;
	path->slots[parent_level] = btrfs_header_nritems(parent);

9431
	btrfs_assert_tree_locked(node);
	level = btrfs_header_level(node);
	path->nodes[level] = node;
	path->slots[level] = 0;
9435
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
9436 9437 9438 9439 9440 9441 9442 9443

	wc->refs[parent_level] = 1;
	wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
	wc->level = level;
	wc->shared_level = -1;
	wc->stage = DROP_REFERENCE;
	wc->update_ref = 0;
	wc->keep_locks = 1;
	wc->for_reloc = 1;
	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);

	while (1) {
9448 9449
		wret = walk_down_tree(trans, root, path, wc);
		if (wret < 0) {
			ret = wret;
			break;
9452
		}

9454
		wret = walk_up_tree(trans, root, path, wc, parent_level);
		if (wret < 0)
			ret = wret;
		if (wret != 0)
			break;
	}

9461
	kfree(wc);
	btrfs_free_path(path);
	return ret;
}

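/*
 * work out the profile relocation should use for this block group:
 * the configured restripe target if there is one, otherwise a profile
 * the currently writeable devices can still provide.
 */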
static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
{
	u64 num_devices;
9469
	u64 stripped;
9470

9471 9472 9473 9474 9475 9476 9477
	/*
	 * if restripe for this chunk_type is on pick target profile and
	 * return, otherwise do the usual balance
	 */
	stripped = get_restripe_target(root->fs_info, flags);
	if (stripped)
		return extended_to_chunk(stripped);
9478

9479
	num_devices = root->fs_info->fs_devices->rw_devices;
9480

9481
	stripped = BTRFS_BLOCK_GROUP_RAID0 |
		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
9483 9484
		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;

	if (num_devices == 1) {
		stripped |= BTRFS_BLOCK_GROUP_DUP;
		stripped = flags & ~stripped;

		/* turn raid0 into single device chunks */
		if (flags & BTRFS_BLOCK_GROUP_RAID0)
			return stripped;

		/* turn mirroring into duplication */
		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
			     BTRFS_BLOCK_GROUP_RAID10))
			return stripped | BTRFS_BLOCK_GROUP_DUP;
	} else {
		/* they already had raid on here, just return */
		if (flags & stripped)
			return flags;

		stripped |= BTRFS_BLOCK_GROUP_DUP;
		stripped = flags & ~stripped;

		/* switch duplicated blocks with raid1 */
		if (flags & BTRFS_BLOCK_GROUP_DUP)
			return stripped | BTRFS_BLOCK_GROUP_RAID1;

9509
		/* this is drive concat, leave it alone */
9510
	}
9511

9512 9513 9514
	return flags;
}

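/*
 * take one read-only reference on the block group; the first reference
 * only succeeds if the space_info still has room for the unused bytes
 * of the group plus a small reserve.
 */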
static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
{
9517 9518
	struct btrfs_space_info *sinfo = cache->space_info;
	u64 num_bytes;
9519
	u64 min_allocable_bytes;
9520
	int ret = -ENOSPC;

	/*
	 * We need some metadata space and system metadata space for
	 * allocating chunks in some corner cases until we force to set
	 * it to be readonly.
	 */
	if ((sinfo->flags &
	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
	    !force)
9530
		min_allocable_bytes = SZ_1M;
9531 9532 9533
	else
		min_allocable_bytes = 0;

9534 9535
	spin_lock(&sinfo->lock);
	spin_lock(&cache->lock);
9536 9537

	if (cache->ro) {
9538
		cache->ro++;
9539 9540 9541 9542
		ret = 0;
		goto out;
	}

9543 9544 9545 9546
	num_bytes = cache->key.offset - cache->reserved - cache->pinned -
		    cache->bytes_super - btrfs_block_group_used(&cache->item);

	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
9547 9548
	    sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
	    min_allocable_bytes <= sinfo->total_bytes) {
9549
		sinfo->bytes_readonly += num_bytes;
9550
		cache->ro++;
9551
		list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
9552 9553
		ret = 0;
	}
9554
out:
9555 9556 9557 9558
	spin_unlock(&cache->lock);
	spin_unlock(&sinfo->lock);
	return ret;
}

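/*
 * set the block group read-only, allocating a new chunk with the
 * target profile first (and forcing another allocation on failure) so
 * the space_info is not left without writeable space.
 */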
int btrfs_inc_block_group_ro(struct btrfs_root *root,
9561
			     struct btrfs_block_group_cache *cache)
9562

9563 9564 9565 9566
{
	struct btrfs_trans_handle *trans;
	u64 alloc_flags;
	int ret;
9567

9568
again:
	trans = btrfs_join_transaction(root);
9570 9571
	if (IS_ERR(trans))
		return PTR_ERR(trans);
9572

9573 9574 9575 9576 9577 9578
	/*
	 * we're not allowed to set block groups readonly after the dirty
	 * block groups cache has started writing.  If it already started,
	 * back off and let this transaction commit
	 */
	mutex_lock(&root->fs_info->ro_block_group_mutex);
9579
	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
		u64 transid = trans->transid;

		mutex_unlock(&root->fs_info->ro_block_group_mutex);
		btrfs_end_transaction(trans, root);

		ret = btrfs_wait_for_commit(root, transid);
		if (ret)
			return ret;
		goto again;
	}

	/*
	 * if we are changing raid levels, try to allocate a corresponding
	 * block group with the new raid level.
	 */
	alloc_flags = update_block_group_flags(root, cache->flags);
	if (alloc_flags != cache->flags) {
		ret = do_chunk_alloc(trans, root, alloc_flags,
				     CHUNK_ALLOC_FORCE);
		/*
		 * ENOSPC is allowed here, we may have enough space
		 * already allocated at the new raid level to
		 * carry on
		 */
		if (ret == -ENOSPC)
			ret = 0;
		if (ret < 0)
			goto out;
	}
9609

9610
	ret = inc_block_group_ro(cache, 0);
9611 9612 9613
	if (!ret)
		goto out;
	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
9614
	ret = do_chunk_alloc(trans, root, alloc_flags,
9615
			     CHUNK_ALLOC_FORCE);
9616 9617
	if (ret < 0)
		goto out;
9618
	ret = inc_block_group_ro(cache, 0);
9619
out:
9620 9621
	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
		alloc_flags = update_block_group_flags(root, cache->flags);
9622
		lock_chunks(root->fs_info->chunk_root);
9623
		check_system_chunk(trans, root, alloc_flags);
9624
		unlock_chunks(root->fs_info->chunk_root);
9625
	}
9626
	mutex_unlock(&root->fs_info->ro_block_group_mutex);
9627

9628 9629 9630
	btrfs_end_transaction(trans, root);
	return ret;
}
9631

9632 9633 9634 9635
int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, u64 type)
{
	u64 alloc_flags = get_alloc_profile(root, type);
9636
	return do_chunk_alloc(trans, root, alloc_flags,
9637
			      CHUNK_ALLOC_FORCE);
9638 9639
}

9640 9641
/*
 * helper to account the unused space of all the readonly block group in the
9642
 * space_info. takes mirrors into account.
9643
 */
9644
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
9645 9646 9647 9648 9649
{
	struct btrfs_block_group_cache *block_group;
	u64 free_bytes = 0;
	int factor;

9650
	/* It's df, we don't care if it's racy */
9651 9652 9653 9654 9655
	if (list_empty(&sinfo->ro_bgs))
		return 0;

	spin_lock(&sinfo->lock);
	list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
		spin_lock(&block_group->lock);

		if (!block_group->ro) {
			spin_unlock(&block_group->lock);
			continue;
		}

		if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
					  BTRFS_BLOCK_GROUP_RAID10 |
					  BTRFS_BLOCK_GROUP_DUP))
			factor = 2;
		else
			factor = 1;

		free_bytes += (block_group->key.offset -
			       btrfs_block_group_used(&block_group->item)) *
			       factor;

		spin_unlock(&block_group->lock);
	}
	spin_unlock(&sinfo->lock);

	return free_bytes;
}

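/*
 * drop one read-only reference; when the last one goes away the unused
 * bytes are returned to the space_info and the group leaves the
 * ro_bgs list.
 */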
void btrfs_dec_block_group_ro(struct btrfs_root *root,
9682
			      struct btrfs_block_group_cache *cache)
9683
{
9684 9685 9686 9687 9688 9689 9690
	struct btrfs_space_info *sinfo = cache->space_info;
	u64 num_bytes;

	BUG_ON(!cache->ro);

	spin_lock(&sinfo->lock);
	spin_lock(&cache->lock);
9691 9692 9693 9694 9695 9696 9697
	if (!--cache->ro) {
		num_bytes = cache->key.offset - cache->reserved -
			    cache->pinned - cache->bytes_super -
			    btrfs_block_group_used(&cache->item);
		sinfo->bytes_readonly -= num_bytes;
		list_del_init(&cache->ro_list);
	}
9698 9699
	spin_unlock(&cache->lock);
	spin_unlock(&sinfo->lock);
9700 9701
}

/*
 * checks to see if its even possible to relocate this block group.
 *
 * @return - -1 if it's not a good idea to relocate this block group, 0 if its
 * ok to go ahead and try.
 */
int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
{
9710 9711 9712 9713
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
	struct btrfs_device *device;
9714
	struct btrfs_trans_handle *trans;
9715
	u64 min_free;
	u64 dev_min = 1;
	u64 dev_nr = 0;
9718
	u64 target;
9719
	int debug;
9720
	int index;
9721 9722
	int full = 0;
	int ret = 0;

9724
	debug = btrfs_test_opt(root->fs_info, ENOSPC_DEBUG);
9725

9726
	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);

9728
	/* odd, couldn't find the block group, leave it alone */
	if (!block_group) {
		if (debug)
			btrfs_warn(root->fs_info,
				   "can't find block group for bytenr %llu",
				   bytenr);
9734
		return -1;
9735
	}

9737 9738
	min_free = btrfs_block_group_used(&block_group->item);

9739
	/* no bytes used, we're good */
9740
	if (!min_free)
		goto out;

9743 9744
	space_info = block_group->space_info;
	spin_lock(&space_info->lock);
9745

9746
	full = space_info->full;
9747

9748 9749
	/*
	 * if this is the last block group we have in this space, we can't
9750 9751 9752 9753
	 * relocate it unless we're able to allocate a new chunk below.
	 *
	 * Otherwise, we need to make sure we have room in the space to handle
	 * all of the extents from this block group.  If we can, we're good
9754
	 */
9755
	if ((space_info->total_bytes != block_group->key.offset) &&
9756 9757 9758
	    (space_info->bytes_used + space_info->bytes_reserved +
	     space_info->bytes_pinned + space_info->bytes_readonly +
	     min_free < space_info->total_bytes)) {
9759 9760
		spin_unlock(&space_info->lock);
		goto out;
9761
	}
9762
	spin_unlock(&space_info->lock);
9763

9764 9765 9766
	/*
	 * ok we don't have enough space, but maybe we have free space on our
	 * devices to allocate new chunks for relocation, so loop through our
9767 9768 9769
	 * alloc devices and guess if we have enough space.  if this block
	 * group is going to be restriped, run checks against the target
	 * profile instead of the current one.
9770 9771
	 */
	ret = -1;
9772

	/*
	 * index:
	 *      0: raid10
	 *      1: raid1
	 *      2: dup
	 *      3: raid0
	 *      4: single
	 */
9781 9782
	target = get_restripe_target(root->fs_info, block_group->flags);
	if (target) {
9783
		index = __get_raid_index(extended_to_chunk(target));
9784 9785 9786 9787 9788
	} else {
		/*
		 * this is just a balance, so if we were marked as full
		 * we know there is no space for a new chunk
		 */
9789 9790 9791 9792 9793
		if (full) {
			if (debug)
				btrfs_warn(root->fs_info,
					"no space to alloc new chunk for block group %llu",
					block_group->key.objectid);
9794
			goto out;
9795
		}
9796 9797 9798 9799

		index = get_block_group_index(block_group);
	}

9800
	if (index == BTRFS_RAID_RAID10) {
9801
		dev_min = 4;
		/* Divide by 2 */
		min_free >>= 1;
9804
	} else if (index == BTRFS_RAID_RAID1) {
9805
		dev_min = 2;
9806
	} else if (index == BTRFS_RAID_DUP) {
		/* Multiply by 2 */
		min_free <<= 1;
9809
	} else if (index == BTRFS_RAID_RAID0) {
9810
		dev_min = fs_devices->rw_devices;
9811
		min_free = div64_u64(min_free, dev_min);
9812 9813
	}

	/* We need to do this so that we can look at pending chunks */
	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

9821 9822
	mutex_lock(&root->fs_info->chunk_mutex);
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
9823
		u64 dev_offset;
9824

9825 9826 9827 9828
		/*
		 * check to make sure we can actually find a chunk with enough
		 * space to fit our block group in.
		 */
9829 9830
		if (device->total_bytes > device->bytes_used + min_free &&
		    !device->is_tgtdev_for_dev_replace) {
9831
			ret = find_free_dev_extent(trans, device, min_free,
9832
						   &dev_offset, NULL);
9833
			if (!ret)
9834 9835 9836
				dev_nr++;

			if (dev_nr >= dev_min)
9837
				break;
9838

9839
			ret = -1;
9840
		}
9841
	}
9842 9843 9844 9845
	if (debug && ret == -1)
		btrfs_warn(root->fs_info,
			"no space to allocate a new chunk for block group %llu",
			block_group->key.objectid);
9846
	mutex_unlock(&root->fs_info->chunk_mutex);
9847
	btrfs_end_transaction(trans, root);
9848
out:
9849
	btrfs_put_block_group(block_group);
9850 9851 9852
	return ret;
}

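/*
 * find the first block group item at or after *key and check that a
 * chunk mapping exists for it; returns 0 with the path on the item,
 * > 0 when there are no more block groups, or a negative error.
 */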
static int find_first_block_group(struct btrfs_root *root,
		struct btrfs_path *path, struct btrfs_key *key)
9855
{
9856
	int ret = 0;
9857 9858 9859
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	int slot;
9860

9861 9862
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	if (ret < 0)
9863 9864
		goto out;

	while (1) {
9866
		slot = path->slots[0];
9867
		leaf = path->nodes[0];
9868 9869 9870 9871 9872
		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
9873
				goto out;
9874
			break;
9875
		}
9876
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
9877

9878
		if (found_key.objectid >= key->objectid &&
9879
		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
			struct extent_map_tree *em_tree;
			struct extent_map *em;

			em_tree = &root->fs_info->mapping_tree.map_tree;
			read_lock(&em_tree->lock);
			em = lookup_extent_mapping(em_tree, found_key.objectid,
						   found_key.offset);
			read_unlock(&em_tree->lock);
			if (!em) {
				btrfs_err(root->fs_info,
			"logical %llu len %llu found bg but no related chunk",
					  found_key.objectid, found_key.offset);
				ret = -ENOENT;
			} else {
				ret = 0;
			}
9896
			free_extent_map(em);
9897 9898
			goto out;
		}
9899
		path->slots[0]++;
9900
	}
9901
out:
9902
	return ret;
9903 9904
}

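/*
 * drop the inode references the block groups hold on their free space
 * cache inodes so they can go away when the fs is being torn down.
 */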
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
	struct btrfs_block_group_cache *block_group;
	u64 last = 0;

	while (1) {
		struct inode *inode;

		block_group = btrfs_lookup_first_block_group(info, last);
		while (block_group) {
			spin_lock(&block_group->lock);
			if (block_group->iref)
				break;
			spin_unlock(&block_group->lock);
			block_group = next_block_group(info->tree_root,
						       block_group);
		}
		if (!block_group) {
			if (last == 0)
				break;
			last = 0;
			continue;
		}

		inode = block_group->inode;
		block_group->iref = 0;
		block_group->inode = NULL;
		spin_unlock(&block_group->lock);
9933
		ASSERT(block_group->io_ctl.inode == NULL);
9934 9935 9936 9937 9938 9939
		iput(inode);
		last = block_group->key.objectid + block_group->key.offset;
		btrfs_put_block_group(block_group);
	}
}

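/*
 * free every cached block group and space_info; only used in the late
 * stages of unmount when nothing else can touch them.
 */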
int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
	struct btrfs_block_group_cache *block_group;
9943
	struct btrfs_space_info *space_info;
9944
	struct btrfs_caching_control *caching_ctl;
	struct rb_node *n;

9947
	down_write(&info->commit_root_sem);
9948 9949 9950 9951 9952 9953
	while (!list_empty(&info->caching_block_groups)) {
		caching_ctl = list_entry(info->caching_block_groups.next,
					 struct btrfs_caching_control, list);
		list_del(&caching_ctl->list);
		put_caching_control(caching_ctl);
	}
9954
	up_write(&info->commit_root_sem);
9955

	spin_lock(&info->unused_bgs_lock);
	while (!list_empty(&info->unused_bgs)) {
		block_group = list_first_entry(&info->unused_bgs,
					       struct btrfs_block_group_cache,
					       bg_list);
		list_del_init(&block_group->bg_list);
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&info->unused_bgs_lock);

	spin_lock(&info->block_group_cache_lock);
	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
		block_group = rb_entry(n, struct btrfs_block_group_cache,
				       cache_node);
		rb_erase(&block_group->cache_node,
			 &info->block_group_cache_tree);
9972
		RB_CLEAR_NODE(&block_group->cache_node);
		spin_unlock(&info->block_group_cache_lock);

9975
		down_write(&block_group->space_info->groups_sem);
		list_del(&block_group->list);
9977
		up_write(&block_group->space_info->groups_sem);
9978

		if (block_group->cached == BTRFS_CACHE_STARTED)
9980
			wait_block_group_cache_done(block_group);

9982 9983 9984 9985
		/*
		 * We haven't cached this block group, which means we could
		 * possibly have excluded extents on this block group.
		 */
9986 9987
		if (block_group->cached == BTRFS_CACHE_NO ||
		    block_group->cached == BTRFS_CACHE_ERROR)
9988 9989
			free_excluded_extents(info->extent_root, block_group);

		btrfs_remove_free_space_cache(block_group);
9991 9992 9993 9994
		ASSERT(list_empty(&block_group->dirty_list));
		ASSERT(list_empty(&block_group->io_list));
		ASSERT(list_empty(&block_group->bg_list));
		ASSERT(atomic_read(&block_group->count) == 1);
9995
		btrfs_put_block_group(block_group);

		spin_lock(&info->block_group_cache_lock);
	}
	spin_unlock(&info->block_group_cache_lock);

	/* now that all the block groups are freed, go through and
	 * free all the space_info structs.  This is only called during
	 * the final stages of unmount, and so we know nobody is
	 * using them.  We call synchronize_rcu() once before we start,
	 * just to be on the safe side.
	 */
	synchronize_rcu();

10009 10010
	release_global_block_rsv(info);

10011
	while (!list_empty(&info->space_info)) {
10012 10013
		int i;

10014 10015 10016
		space_info = list_entry(info->space_info.next,
					struct btrfs_space_info,
					list);
10017 10018 10019 10020 10021 10022

		/*
		 * Do not hide this behind enospc_debug, this is actually
		 * important and indicates a real bug if this happens.
		 */
		if (WARN_ON(space_info->bytes_pinned > 0 ||
10023
			    space_info->bytes_reserved > 0 ||
10024 10025
			    space_info->bytes_may_use > 0))
			dump_space_info(space_info, 0, 0);
10026
		list_del(&space_info->list);
10027 10028
		for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
			struct kobject *kobj;
10029 10030 10031
			kobj = space_info->block_group_kobjs[i];
			space_info->block_group_kobjs[i] = NULL;
			if (kobj) {
10032 10033 10034 10035 10036 10037
				kobject_del(kobj);
				kobject_put(kobj);
			}
		}
		kobject_del(&space_info->kobj);
		kobject_put(&space_info->kobj);
10038
	}
	return 0;
}

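/*
 * add the block group to its space_info's list for the raid type and,
 * for the first group of that type, create the sysfs raid kobject.
 */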
static void __link_block_group(struct btrfs_space_info *space_info,
			       struct btrfs_block_group_cache *cache)
{
	int index = get_block_group_index(cache);
10046
	bool first = false;
10047 10048

	down_write(&space_info->groups_sem);
10049 10050 10051 10052 10053 10054
	if (list_empty(&space_info->block_groups[index]))
		first = true;
	list_add_tail(&cache->list, &space_info->block_groups[index]);
	up_write(&space_info->groups_sem);

	if (first) {
10055
		struct raid_kobject *rkobj;
10056 10057
		int ret;

10058 10059 10060 10061 10062 10063 10064
		rkobj = kzalloc(sizeof(*rkobj), GFP_NOFS);
		if (!rkobj)
			goto out_err;
		rkobj->raid_type = index;
		kobject_init(&rkobj->kobj, &btrfs_raid_ktype);
		ret = kobject_add(&rkobj->kobj, &space_info->kobj,
				  "%s", get_raid_name(index));
10065
		if (ret) {
10066 10067
			kobject_put(&rkobj->kobj);
			goto out_err;
10068
		}
10069
		space_info->block_group_kobjs[index] = &rkobj->kobj;
10070
	}
10071 10072 10073 10074

	return;
out_err:
	pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n");
10075 10076
}

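/*
 * allocate and initialize the in-memory descriptor for a block group
 * starting at @start; nothing is read from or written to disk here.
 */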
static struct btrfs_block_group_cache *
btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
{
	struct btrfs_block_group_cache *cache;

	cache = kzalloc(sizeof(*cache), GFP_NOFS);
	if (!cache)
		return NULL;

	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
					GFP_NOFS);
	if (!cache->free_space_ctl) {
		kfree(cache);
		return NULL;
	}

	cache->key.objectid = start;
	cache->key.offset = size;
	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;

	cache->sectorsize = root->sectorsize;
	cache->fs_info = root->fs_info;
	cache->full_stripe_len = btrfs_full_stripe_len(root,
					       &root->fs_info->mapping_tree,
					       start);
10102 10103
	set_free_space_tree_thresholds(cache);

10104 10105
	atomic_set(&cache->count, 1);
	spin_lock_init(&cache->lock);
10106
	init_rwsem(&cache->data_rwsem);
10107 10108
	INIT_LIST_HEAD(&cache->list);
	INIT_LIST_HEAD(&cache->cluster_list);
10109
	INIT_LIST_HEAD(&cache->bg_list);
10110
	INIT_LIST_HEAD(&cache->ro_list);
10111
	INIT_LIST_HEAD(&cache->dirty_list);
10112
	INIT_LIST_HEAD(&cache->io_list);
10113
	btrfs_init_free_space_ctl(cache);
10114
	atomic_set(&cache->trimming, 0);
10115
	mutex_init(&cache->free_space_lock);
10116 10117 10118 10119

	return cache;
}

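/*
 * called at mount time to load every block group item from the extent
 * tree and build the in-memory caches and space_info accounting.
 */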
int btrfs_read_block_groups(struct btrfs_root *root)
{
	struct btrfs_path *path;
	int ret;
	struct btrfs_block_group_cache *cache;
	struct btrfs_fs_info *info = root->fs_info;
10126
	struct btrfs_space_info *space_info;
	struct btrfs_key key;
	struct btrfs_key found_key;
10129
	struct extent_buffer *leaf;
10130 10131
	int need_clear = 0;
	u64 cache_gen;
10132

	root = info->extent_root;
	key.objectid = 0;
10135
	key.offset = 0;
10136
	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
10140
	path->reada = READA_FORWARD;

10142
	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
10143
	if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
10144
	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
10145
		need_clear = 1;
10146
	if (btrfs_test_opt(root->fs_info, CLEAR_CACHE))
10147
		need_clear = 1;
10148

	while (1) {
10150
		ret = find_first_block_group(root, path, &key);
10151 10152
		if (ret > 0)
			break;
10153 10154
		if (ret != 0)
			goto error;
10155

10156 10157
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
10158 10159 10160

		cache = btrfs_create_block_group_cache(root, found_key.objectid,
						       found_key.offset);
		if (!cache) {
10162
			ret = -ENOMEM;
10163
			goto error;
		}
10165

		if (need_clear) {
			/*
			 * When we mount with old space cache, we need to
			 * set BTRFS_DC_CLEAR and set dirty flag.
			 *
			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
			 *    truncate the old free space cache inode and
			 *    setup a new one.
			 * b) Setting 'dirty flag' makes sure that we flush
			 *    the new space cache info onto disk.
			 */
10177
			if (btrfs_test_opt(root->fs_info, SPACE_CACHE))
10178
				cache->disk_cache_state = BTRFS_DC_CLEAR;
10179
		}
10180

10181 10182 10183
		read_extent_buffer(leaf, &cache->item,
				   btrfs_item_ptr_offset(leaf, path->slots[0]),
				   sizeof(cache->item));
10184
		cache->flags = btrfs_block_group_flags(&cache->item);
10185

C
Chris Mason 已提交
10186
		key.objectid = found_key.objectid + found_key.offset;
10187
		btrfs_release_path(path);
10188

10189 10190 10191 10192 10193
		/*
		 * We need to exclude the super stripes now so that the space
		 * info has super bytes accounted for, otherwise we'll think
		 * we have more space than we actually do.
		 */
10194 10195 10196 10197 10198 10199 10200
		ret = exclude_super_stripes(root, cache);
		if (ret) {
			/*
			 * We may have excluded something, so call this just in
			 * case.
			 */
			free_excluded_extents(root, cache);
10201
			btrfs_put_block_group(cache);
10202 10203
			goto error;
		}
10204

J
Josef Bacik 已提交
10205 10206 10207 10208 10209 10210 10211 10212
		/*
		 * check for two cases, either we are full, and therefore
		 * don't need to bother with the caching work since we won't
		 * find any space, or we are empty, and we can just add all
		 * the space in and be done with it.  This saves us _alot_ of
		 * time, particularly in the full case.
		 */
		if (found_key.offset == btrfs_block_group_used(&cache->item)) {
10213
			cache->last_byte_to_unpin = (u64)-1;
J
Josef Bacik 已提交
10214
			cache->cached = BTRFS_CACHE_FINISHED;
10215
			free_excluded_extents(root, cache);
J
Josef Bacik 已提交
10216
		} else if (btrfs_block_group_used(&cache->item) == 0) {
10217
			cache->last_byte_to_unpin = (u64)-1;
J
Josef Bacik 已提交
10218 10219 10220 10221 10222
			cache->cached = BTRFS_CACHE_FINISHED;
			add_new_free_space(cache, root->fs_info,
					   found_key.objectid,
					   found_key.objectid +
					   found_key.offset);
10223
			free_excluded_extents(root, cache);
J
Josef Bacik 已提交
10224
		}
10225

10226 10227 10228 10229 10230 10231 10232
		ret = btrfs_add_block_group_cache(root->fs_info, cache);
		if (ret) {
			btrfs_remove_free_space_cache(cache);
			btrfs_put_block_group(cache);
			goto error;
		}

10233
		trace_btrfs_add_block_group(root->fs_info, cache, 0);
10234 10235
		ret = update_space_info(info, cache->flags, found_key.offset,
					btrfs_block_group_used(&cache->item),
10236
					cache->bytes_super, &space_info);
10237 10238 10239 10240 10241
		if (ret) {
			btrfs_remove_free_space_cache(cache);
			spin_lock(&info->block_group_cache_lock);
			rb_erase(&cache->cache_node,
				 &info->block_group_cache_tree);
10242
			RB_CLEAR_NODE(&cache->cache_node);
10243 10244 10245 10246 10247
			spin_unlock(&info->block_group_cache_lock);
			btrfs_put_block_group(cache);
			goto error;
		}

10248
		cache->space_info = space_info;
10249

10250
		__link_block_group(space_info, cache);
J
Josef Bacik 已提交
10251

10252
		set_avail_alloc_bits(root->fs_info, cache->flags);
10253
		if (btrfs_chunk_readonly(root, cache->key.objectid)) {
10254
			inc_block_group_ro(cache, 1);
10255 10256 10257 10258 10259 10260 10261 10262 10263 10264
		} else if (btrfs_block_group_used(&cache->item) == 0) {
			spin_lock(&info->unused_bgs_lock);
			/* Should always be true but just in case. */
			if (list_empty(&cache->bg_list)) {
				btrfs_get_block_group(cache);
				list_add_tail(&cache->bg_list,
					      &info->unused_bgs);
			}
			spin_unlock(&info->unused_bgs_lock);
		}
C
Chris Mason 已提交
10265
	}
10266 10267 10268 10269 10270

	list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
		if (!(get_alloc_profile(root, space_info->flags) &
		      (BTRFS_BLOCK_GROUP_RAID10 |
		       BTRFS_BLOCK_GROUP_RAID1 |
D
David Woodhouse 已提交
10271 10272
		       BTRFS_BLOCK_GROUP_RAID5 |
		       BTRFS_BLOCK_GROUP_RAID6 |
10273 10274 10275 10276 10277 10278
		       BTRFS_BLOCK_GROUP_DUP)))
			continue;
		/*
		 * avoid allocating from un-mirrored block group if there are
		 * mirrored block groups.
		 */
10279 10280 10281
		list_for_each_entry(cache,
				&space_info->block_groups[BTRFS_RAID_RAID0],
				list)
10282
			inc_block_group_ro(cache, 1);
10283 10284 10285
		list_for_each_entry(cache,
				&space_info->block_groups[BTRFS_RAID_SINGLE],
				list)
10286
			inc_block_group_ro(cache, 1);
C
Chris Mason 已提交
10287
	}
10288 10289

	init_global_block_rsv(info);
10290 10291
	ret = 0;
error:
C
Chris Mason 已提交
10292
	btrfs_free_path(path);
10293
	return ret;
C
Chris Mason 已提交
10294
}

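/*
 * Insert the block group items queued on trans->new_bgs into the extent tree,
 * finish their chunk allocation and create their free space tree entries.
 * Any failure here aborts the transaction.
 */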
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root)
{
	struct btrfs_block_group_cache *block_group, *tmp;
	struct btrfs_root *extent_root = root->fs_info->extent_root;
	struct btrfs_block_group_item item;
	struct btrfs_key key;
	int ret = 0;
	bool can_flush_pending_bgs = trans->can_flush_pending_bgs;

	trans->can_flush_pending_bgs = false;
	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
		if (ret)
			goto next;

		spin_lock(&block_group->lock);
		memcpy(&item, &block_group->item, sizeof(item));
		memcpy(&key, &block_group->key, sizeof(key));
		spin_unlock(&block_group->lock);

		ret = btrfs_insert_item(trans, extent_root, &key, &item,
					sizeof(item));
		if (ret)
			btrfs_abort_transaction(trans, ret);
		ret = btrfs_finish_chunk_alloc(trans, extent_root,
					       key.objectid, key.offset);
		if (ret)
			btrfs_abort_transaction(trans, ret);
		add_block_group_free_space(trans, root->fs_info, block_group);
		/* already aborted the transaction if it failed. */
next:
		list_del_init(&block_group->bg_list);
	}
	trans->can_flush_pending_bgs = can_flush_pending_bgs;
}

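/*
 * Create a new block group for the chunk at [chunk_offset, chunk_offset +
 * size): set up the in-memory descriptor and its space_info accounting, and
 * queue it on trans->new_bgs for btrfs_create_pending_block_groups() to
 * insert into the extent tree.
 */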
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, u64 bytes_used,
			   u64 type, u64 chunk_objectid, u64 chunk_offset,
			   u64 size)
{
	int ret;
	struct btrfs_root *extent_root;
	struct btrfs_block_group_cache *cache;
	extent_root = root->fs_info->extent_root;

	btrfs_set_log_full_commit(root->fs_info, trans);

	cache = btrfs_create_block_group_cache(root, chunk_offset, size);
	if (!cache)
		return -ENOMEM;

	btrfs_set_block_group_used(&cache->item, bytes_used);
	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
	btrfs_set_block_group_flags(&cache->item, type);

	cache->flags = type;
	cache->last_byte_to_unpin = (u64)-1;
	cache->cached = BTRFS_CACHE_FINISHED;
	cache->needs_free_space = 1;
	ret = exclude_super_stripes(root, cache);
	if (ret) {
		/*
		 * We may have excluded something, so call this just in
		 * case.
		 */
		free_excluded_extents(root, cache);
		btrfs_put_block_group(cache);
		return ret;
	}

	add_new_free_space(cache, root->fs_info, chunk_offset,
			   chunk_offset + size);

	free_excluded_extents(root, cache);

#ifdef CONFIG_BTRFS_DEBUG
	if (btrfs_should_fragment_free_space(root, cache)) {
		u64 new_bytes_used = size - bytes_used;

		bytes_used += new_bytes_used >> 1;
		fragment_free_space(root, cache);
	}
#endif
	/*
	 * Call to ensure the corresponding space_info object is created and
	 * assigned to our block group, but don't update its counters just yet.
	 * We want our bg to be added to the rbtree with its ->space_info set.
	 */
	ret = update_space_info(root->fs_info, cache->flags, 0, 0, 0,
				&cache->space_info);
	if (ret) {
		btrfs_remove_free_space_cache(cache);
		btrfs_put_block_group(cache);
		return ret;
	}

	ret = btrfs_add_block_group_cache(root->fs_info, cache);
	if (ret) {
		btrfs_remove_free_space_cache(cache);
		btrfs_put_block_group(cache);
		return ret;
	}

	/*
	 * Now that our block group has its ->space_info set and is inserted in
	 * the rbtree, update the space info's counters.
	 */
	trace_btrfs_add_block_group(root->fs_info, cache, 1);
	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
				cache->bytes_super, &cache->space_info);
	if (ret) {
		btrfs_remove_free_space_cache(cache);
		spin_lock(&root->fs_info->block_group_cache_lock);
		rb_erase(&cache->cache_node,
			 &root->fs_info->block_group_cache_tree);
		RB_CLEAR_NODE(&cache->cache_node);
		spin_unlock(&root->fs_info->block_group_cache_lock);
		btrfs_put_block_group(cache);
		return ret;
	}
	update_global_block_rsv(root->fs_info);

	__link_block_group(cache->space_info, cache);

	list_add_tail(&cache->bg_list, &trans->new_bgs);

	set_avail_alloc_bits(extent_root->fs_info, type);
	return 0;
}

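/*
 * Clear the extended profile bits for @flags from the per-type
 * avail_*_alloc_bits masks, under the profiles seqlock.
 */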
static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
	u64 extra_flags = chunk_to_extended(flags) &
				BTRFS_EXTENDED_PROFILE_MASK;

	write_seqlock(&fs_info->profiles_lock);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		fs_info->avail_data_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_METADATA)
		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		fs_info->avail_system_alloc_bits &= ~extra_flags;
	write_sequnlock(&fs_info->profiles_lock);
}

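/*
 * Remove the block group starting at @group_start: drop its free space cache
 * inode and items, unlink it from the rbtree, lists and sysfs, update the
 * space_info accounting, and finally delete its block group item from the
 * extent tree.
 */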
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root, u64 group_start,
			     struct extent_map *em)
{
	struct btrfs_path *path;
	struct btrfs_block_group_cache *block_group;
	struct btrfs_free_cluster *cluster;
	struct btrfs_root *tree_root = root->fs_info->tree_root;
	struct btrfs_key key;
	struct inode *inode;
	struct kobject *kobj = NULL;
	int ret;
	int index;
	int factor;
	struct btrfs_caching_control *caching_ctl = NULL;
	bool remove_em;

	root = root->fs_info->extent_root;

	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
	BUG_ON(!block_group);
	BUG_ON(!block_group->ro);

	/*
	 * Free the reserved super bytes from this block group before
	 * removing it.
	 */
	free_excluded_extents(root, block_group);

	memcpy(&key, &block_group->key, sizeof(key));
	index = get_block_group_index(block_group);
	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
				  BTRFS_BLOCK_GROUP_RAID1 |
				  BTRFS_BLOCK_GROUP_RAID10))
		factor = 2;
	else
		factor = 1;

	/* make sure this block group isn't part of an allocation cluster */
	cluster = &root->fs_info->data_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	/*
	 * make sure this block group isn't part of a metadata
	 * allocation cluster
	 */
	cluster = &root->fs_info->meta_alloc_cluster;
	spin_lock(&cluster->refill_lock);
	btrfs_return_cluster_to_free_space(block_group, cluster);
	spin_unlock(&cluster->refill_lock);

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * get the inode first so any iput calls done for the io_list
	 * aren't the final iput (no unlinks allowed now)
	 */
	inode = lookup_free_space_inode(tree_root, block_group, path);

	mutex_lock(&trans->transaction->cache_write_mutex);
	/*
	 * make sure our free space cache IO is done before removing the
	 * free space inode
	 */
	spin_lock(&trans->transaction->dirty_bgs_lock);
	if (!list_empty(&block_group->io_list)) {
		list_del_init(&block_group->io_list);

		WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);

		spin_unlock(&trans->transaction->dirty_bgs_lock);
		btrfs_wait_cache_io(root, trans, block_group,
				    &block_group->io_ctl, path,
				    block_group->key.objectid);
		btrfs_put_block_group(block_group);
		spin_lock(&trans->transaction->dirty_bgs_lock);
	}

	if (!list_empty(&block_group->dirty_list)) {
		list_del_init(&block_group->dirty_list);
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&trans->transaction->dirty_bgs_lock);
	mutex_unlock(&trans->transaction->cache_write_mutex);

	if (!IS_ERR(inode)) {
		ret = btrfs_orphan_add(trans, inode);
		if (ret) {
			btrfs_add_delayed_iput(inode);
			goto out;
		}
		clear_nlink(inode);
		/* One for the block groups ref */
		spin_lock(&block_group->lock);
		if (block_group->iref) {
			block_group->iref = 0;
			block_group->inode = NULL;
			spin_unlock(&block_group->lock);
			iput(inode);
		} else {
			spin_unlock(&block_group->lock);
		}
		/* One for our lookup ref */
		btrfs_add_delayed_iput(inode);
	}

	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
	key.offset = block_group->key.objectid;
	key.type = 0;

	ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0)
		btrfs_release_path(path);
	if (ret == 0) {
		ret = btrfs_del_item(trans, tree_root, path);
		if (ret)
			goto out;
		btrfs_release_path(path);
	}

	spin_lock(&root->fs_info->block_group_cache_lock);
	rb_erase(&block_group->cache_node,
		 &root->fs_info->block_group_cache_tree);
	RB_CLEAR_NODE(&block_group->cache_node);

	if (root->fs_info->first_logical_byte == block_group->key.objectid)
		root->fs_info->first_logical_byte = (u64)-1;
	spin_unlock(&root->fs_info->block_group_cache_lock);

	down_write(&block_group->space_info->groups_sem);
	/*
	 * we must use list_del_init so people can check to see if they
	 * are still on the list after taking the semaphore
	 */
	list_del_init(&block_group->list);
	if (list_empty(&block_group->space_info->block_groups[index])) {
		kobj = block_group->space_info->block_group_kobjs[index];
		block_group->space_info->block_group_kobjs[index] = NULL;
		clear_avail_alloc_bits(root->fs_info, block_group->flags);
	}
	up_write(&block_group->space_info->groups_sem);
	if (kobj) {
		kobject_del(kobj);
		kobject_put(kobj);
	}

	if (block_group->has_caching_ctl)
		caching_ctl = get_caching_control(block_group);
	if (block_group->cached == BTRFS_CACHE_STARTED)
		wait_block_group_cache_done(block_group);
	if (block_group->has_caching_ctl) {
		down_write(&root->fs_info->commit_root_sem);
		if (!caching_ctl) {
			struct btrfs_caching_control *ctl;

			list_for_each_entry(ctl,
				    &root->fs_info->caching_block_groups, list)
				if (ctl->block_group == block_group) {
					caching_ctl = ctl;
					atomic_inc(&caching_ctl->count);
					break;
				}
		}
		if (caching_ctl)
			list_del_init(&caching_ctl->list);
		up_write(&root->fs_info->commit_root_sem);
		if (caching_ctl) {
			/* Once for the caching bgs list and once for us. */
			put_caching_control(caching_ctl);
			put_caching_control(caching_ctl);
		}
	}

	spin_lock(&trans->transaction->dirty_bgs_lock);
	if (!list_empty(&block_group->dirty_list)) {
		WARN_ON(1);
	}
	if (!list_empty(&block_group->io_list)) {
		WARN_ON(1);
	}
	spin_unlock(&trans->transaction->dirty_bgs_lock);
	btrfs_remove_free_space_cache(block_group);

	spin_lock(&block_group->space_info->lock);
	list_del_init(&block_group->ro_list);

	if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
		WARN_ON(block_group->space_info->total_bytes
			< block_group->key.offset);
		WARN_ON(block_group->space_info->bytes_readonly
			< block_group->key.offset);
		WARN_ON(block_group->space_info->disk_total
			< block_group->key.offset * factor);
	}
	block_group->space_info->total_bytes -= block_group->key.offset;
	block_group->space_info->bytes_readonly -= block_group->key.offset;
	block_group->space_info->disk_total -= block_group->key.offset * factor;

	spin_unlock(&block_group->space_info->lock);

	memcpy(&key, &block_group->key, sizeof(key));

	lock_chunks(root);
	if (!list_empty(&em->list)) {
		/* We're in the transaction->pending_chunks list. */
		free_extent_map(em);
	}
	spin_lock(&block_group->lock);
	block_group->removed = 1;
	/*
	 * At this point trimming can't start on this block group, because we
	 * removed the block group from the fs_info->block_group_cache_tree
	 * rbtree, so no one can find it anymore, and even if someone already
	 * got this
	 * block group before we removed it from the rbtree, they have already
	 * incremented block_group->trimming - if they didn't, they won't find
	 * any free space entries because we already removed them all when we
	 * called btrfs_remove_free_space_cache().
	 *
	 * And we must not remove the extent map from the fs_info->mapping_tree
	 * to prevent the same logical address range and physical device space
	 * ranges from being reused for a new block group. This is because our
	 * fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
	 * completely transactionless, so while it is trimming a range the
	 * currently running transaction might finish and a new one start,
	 * allowing for new block groups to be created that can reuse the same
	 * physical device locations unless we take this special care.
	 *
	 * There may also be an implicit trim operation if the file system
	 * is mounted with -odiscard. The same protections must remain
	 * in place until the extents have been discarded completely when
	 * the transaction commit has completed.
	 */
	remove_em = (atomic_read(&block_group->trimming) == 0);
	/*
	 * Make sure a trimmer task always sees the em in the pinned_chunks list
	 * if it sees block_group->removed == 1 (needs to lock block_group->lock
	 * before checking block_group->removed).
	 */
	if (!remove_em) {
		/*
		 * Our em might be in trans->transaction->pending_chunks which
		 * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
		 * and so is the fs_info->pinned_chunks list.
		 *
		 * So at this point we must be holding the chunk_mutex to avoid
		 * any races with chunk allocation (more specifically at
		 * volumes.c:contains_pending_extent()), to ensure it always
		 * sees the em, either in the pending_chunks list or in the
		 * pinned_chunks list.
		 */
		list_move_tail(&em->list, &root->fs_info->pinned_chunks);
	}
	spin_unlock(&block_group->lock);

	if (remove_em) {
		struct extent_map_tree *em_tree;

		em_tree = &root->fs_info->mapping_tree.map_tree;
		write_lock(&em_tree->lock);
		/*
		 * The em might be in the pending_chunks list, so make sure the
		 * chunk mutex is locked, since remove_extent_mapping() will
		 * delete us from that list.
		 */
		remove_extent_mapping(em_tree, em);
		write_unlock(&em_tree->lock);
		/* once for the tree */
		free_extent_map(em);
	}

	unlock_chunks(root);

	ret = remove_block_group_free_space(trans, root->fs_info, block_group);
	if (ret)
		goto out;

	btrfs_put_block_group(block_group);
	btrfs_put_block_group(block_group);

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0)
		ret = -EIO;
	if (ret < 0)
		goto out;

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	return ret;
}

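/*
 * Start a transaction with enough metadata units reserved to remove the
 * block group/chunk at @chunk_offset; see the reservation breakdown below.
 */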
struct btrfs_trans_handle *
btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
				     const u64 chunk_offset)
{
	struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
	struct extent_map *em;
	struct map_lookup *map;
	unsigned int num_items;

	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
	read_unlock(&em_tree->lock);
	ASSERT(em && em->start == chunk_offset);

	/*
	 * We need to reserve 3 + N units from the metadata space info in order
	 * to remove a block group (done at btrfs_remove_chunk() and at
	 * btrfs_remove_block_group()), which are used for:
	 *
	 * 1 unit for adding the free space inode's orphan (located in the tree
	 * of tree roots).
	 * 1 unit for deleting the block group item (located in the extent
	 * tree).
	 * 1 unit for deleting the free space item (located in tree of tree
	 * roots).
	 * N units for deleting N device extent items corresponding to each
	 * stripe (located in the device tree).
	 *
	 * In order to remove a block group we also need to reserve units in the
	 * system space info in order to update the chunk tree (update one or
	 * more device items and remove one chunk item), but this is done at
	 * btrfs_remove_chunk() through a call to check_system_chunk().
	 */
	map = em->map_lookup;
	num_items = 3 + map->num_stripes;
	free_extent_map(em);

	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
							   num_items, 1);
}

/*
 * Process the unused_bgs list and remove any that don't have any allocated
 * space inside of them.
 */
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group_cache *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_trans_handle *trans;
	int ret = 0;

	if (!fs_info->open)
		return;

	spin_lock(&fs_info->unused_bgs_lock);
	while (!list_empty(&fs_info->unused_bgs)) {
		u64 start, end;
		int trimming;

		block_group = list_first_entry(&fs_info->unused_bgs,
					       struct btrfs_block_group_cache,
					       bg_list);
		list_del_init(&block_group->bg_list);

		space_info = block_group->space_info;

		if (ret || btrfs_mixed_space_info(space_info)) {
			btrfs_put_block_group(block_group);
			continue;
		}
		spin_unlock(&fs_info->unused_bgs_lock);

		mutex_lock(&fs_info->delete_unused_bgs_mutex);

		/* Don't want to race with allocators so take the groups_sem */
		down_write(&space_info->groups_sem);
		spin_lock(&block_group->lock);
		if (block_group->reserved ||
		    btrfs_block_group_used(&block_group->item) ||
		    block_group->ro ||
		    list_is_singular(&block_group->list)) {
			/*
			 * We want to bail if we made new allocations or have
			 * outstanding allocations in this block group.  We do
			 * the ro check in case balance is currently acting on
			 * this block group.
			 */
			spin_unlock(&block_group->lock);
			up_write(&space_info->groups_sem);
			goto next;
		}
		spin_unlock(&block_group->lock);

		/* We don't want to force the issue, only flip if it's ok. */
		ret = inc_block_group_ro(block_group, 0);
		up_write(&space_info->groups_sem);
		if (ret < 0) {
			ret = 0;
			goto next;
		}

		/*
		 * Want to do this before we do anything else so we can recover
		 * properly if we fail to join the transaction.
		 */
		trans = btrfs_start_trans_remove_block_group(fs_info,
						     block_group->key.objectid);
		if (IS_ERR(trans)) {
			btrfs_dec_block_group_ro(root, block_group);
			ret = PTR_ERR(trans);
			goto next;
		}

		/*
		 * We could have pending pinned extents for this block group,
		 * just delete them, we don't care about them anymore.
		 */
		start = block_group->key.objectid;
		end = start + block_group->key.offset - 1;
		/*
		 * Hold the unused_bg_unpin_mutex lock to avoid racing with
		 * btrfs_finish_extent_commit(). If we are at transaction N,
		 * another task might be running finish_extent_commit() for the
		 * previous transaction N - 1, and have seen a range belonging
		 * to the block group in freed_extents[] before we were able to
		 * clear the whole block group range from freed_extents[]. This
		 * means that task can look up the block group after we
		 * unpinned it from freed_extents[] and removed it, leading to
		 * a BUG_ON() at btrfs_unpin_extent_range().
		 */
		mutex_lock(&fs_info->unused_bg_unpin_mutex);
		ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
				  EXTENT_DIRTY);
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			btrfs_dec_block_group_ro(root, block_group);
			goto end_trans;
		}
		ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
				  EXTENT_DIRTY);
		if (ret) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			btrfs_dec_block_group_ro(root, block_group);
			goto end_trans;
		}
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);

		/* Reset pinned so btrfs_put_block_group doesn't complain */
		spin_lock(&space_info->lock);
		spin_lock(&block_group->lock);

		space_info->bytes_pinned -= block_group->pinned;
		space_info->bytes_readonly += block_group->pinned;
		percpu_counter_add(&space_info->total_bytes_pinned,
				   -block_group->pinned);
		block_group->pinned = 0;

		spin_unlock(&block_group->lock);
		spin_unlock(&space_info->lock);

		/* DISCARD can flip during remount */
		trimming = btrfs_test_opt(root->fs_info, DISCARD);

		/* Implicit trim during transaction commit. */
		if (trimming)
			btrfs_get_block_group_trimming(block_group);

		/*
		 * btrfs_remove_chunk() will abort the transaction if things go
		 * horribly wrong.
		 */
		ret = btrfs_remove_chunk(trans, root,
					 block_group->key.objectid);

		if (ret) {
			if (trimming)
				btrfs_put_block_group_trimming(block_group);
			goto end_trans;
		}

		/*
		 * If we're not mounted with -odiscard, we can just forget
		 * about this block group. Otherwise we'll need to wait
		 * until transaction commit to do the actual discard.
		 */
		if (trimming) {
			spin_lock(&fs_info->unused_bgs_lock);
			/*
			 * A concurrent scrub might have added us to the list
			 * fs_info->unused_bgs, so use a list_move operation
			 * to add the block group to the deleted_bgs list.
			 */
			list_move(&block_group->bg_list,
				  &trans->transaction->deleted_bgs);
			spin_unlock(&fs_info->unused_bgs_lock);
			btrfs_get_block_group(block_group);
		}
end_trans:
		btrfs_end_transaction(trans, root);
next:
		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
		btrfs_put_block_group(block_group);
		spin_lock(&fs_info->unused_bgs_lock);
	}
	spin_unlock(&fs_info->unused_bgs_lock);
}

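/*
 * Create the initial space_info objects for the SYSTEM profile and for
 * METADATA and DATA (or a single mixed group, if the MIXED_GROUPS incompat
 * feature is set).
 */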
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_space_info *space_info;
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return -EINVAL;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
	}
out:
	return ret;
}

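/*
 * Unpin [start, end] on behalf of the error handling code; the final 'false'
 * argument to unpin_extent_range() means the space is not returned to the
 * free space cache here.
 */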
int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
{
	return unpin_extent_range(root, start, end, false);
}

/*
 * It used to be that old block groups would be left around forever.
 * Iterating over them would be enough to trim unused space.  Since we
 * now automatically remove them, we also need to iterate over unallocated
 * space.
 *
 * We don't want a transaction for this since the discard may take a
 * substantial amount of time.  We don't require that a transaction be
 * running, but we do need to take a running transaction into account
 * to ensure that we're not discarding chunks that were released in
 * the current transaction.
 *
 * Holding the chunks lock will prevent other threads from allocating
 * or releasing chunks, but it won't prevent a running transaction
 * from committing and releasing the memory that the pending chunks
 * list head uses.  For that, we need to take a reference to the
 * transaction.
 */
static int btrfs_trim_free_extents(struct btrfs_device *device,
				   u64 minlen, u64 *trimmed)
{
	u64 start = 0, len = 0;
	int ret;

	*trimmed = 0;

	/* Not writeable = nothing to do. */
	if (!device->writeable)
		return 0;

	/* No free space = nothing to do. */
	if (device->total_bytes <= device->bytes_used)
		return 0;

	ret = 0;

	while (1) {
		struct btrfs_fs_info *fs_info = device->dev_root->fs_info;
		struct btrfs_transaction *trans;
		u64 bytes;

		ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
		if (ret)
			return ret;

		down_read(&fs_info->commit_root_sem);

		spin_lock(&fs_info->trans_lock);
		trans = fs_info->running_transaction;
		if (trans)
			atomic_inc(&trans->use_count);
		spin_unlock(&fs_info->trans_lock);

		ret = find_free_dev_extent_start(trans, device, minlen, start,
						 &start, &len);
		if (trans)
			btrfs_put_transaction(trans);

		if (ret) {
			up_read(&fs_info->commit_root_sem);
			mutex_unlock(&fs_info->chunk_mutex);
			if (ret == -ENOSPC)
				ret = 0;
			break;
		}

		ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
		up_read(&fs_info->commit_root_sem);
		mutex_unlock(&fs_info->chunk_mutex);

		if (ret)
			break;

		start += len;
		*trimmed += bytes;

		if (fatal_signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}

		cond_resched();
	}

	return ret;
}

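/*
 * Handle an fstrim request (struct fstrim_range): trim the free space of
 * every block group overlapping the requested range, then trim unallocated
 * space on each writeable device.
 */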
int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_group_cache *cache = NULL;
	struct btrfs_device *device;
	struct list_head *devices;
	u64 group_trimmed;
	u64 start;
	u64 end;
	u64 trimmed = 0;
	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
	int ret = 0;

	/*
	 * Try to trim all FS space; our block group may start from non-zero.
	 */
	if (range->len == total_bytes)
		cache = btrfs_lookup_first_block_group(fs_info, range->start);
	else
		cache = btrfs_lookup_block_group(fs_info, range->start);

	while (cache) {
		if (cache->key.objectid >= (range->start + range->len)) {
			btrfs_put_block_group(cache);
			break;
		}

		start = max(range->start, cache->key.objectid);
		end = min(range->start + range->len,
				cache->key.objectid + cache->key.offset);

		if (end - start >= range->minlen) {
			if (!block_group_cache_done(cache)) {
				ret = cache_block_group(cache, 0);
				if (ret) {
					btrfs_put_block_group(cache);
					break;
				}
				ret = wait_block_group_cache_done(cache);
				if (ret) {
					btrfs_put_block_group(cache);
					break;
				}
			}
			ret = btrfs_trim_block_group(cache,
						     &group_trimmed,
						     start,
						     end,
						     range->minlen);

			trimmed += group_trimmed;
			if (ret) {
				btrfs_put_block_group(cache);
				break;
			}
		}

		cache = next_block_group(fs_info->tree_root, cache);
	}

	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
	devices = &root->fs_info->fs_devices->alloc_list;
	list_for_each_entry(device, devices, dev_alloc_list) {
		ret = btrfs_trim_free_extents(device, range->minlen,
					      &group_trimmed);
		if (ret)
			break;

		trimmed += group_trimmed;
	}
	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);

	range->len = trimmed;
	return ret;
}

/*
 * btrfs_{start,end}_write_no_snapshoting() are similar to
 * mnt_{want,drop}_write(): they are used to prevent some tasks from writing
 * data into the page cache through nocow before the subvolume is snapshotted
 * and only flushing it to disk after the snapshot creation, and to prevent
 * operations while snapshotting is ongoing that would make the snapshot
 * inconsistent (writes followed by expanding truncates, for example).
 */
void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
{
	percpu_counter_dec(&root->subv_writers->counter);
	/*
	 * Make sure counter is updated before we wake up waiters.
	 */
	smp_mb();
	if (waitqueue_active(&root->subv_writers->wait))
		wake_up(&root->subv_writers->wait);
}

int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
{
	if (atomic_read(&root->will_be_snapshoted))
		return 0;

	percpu_counter_inc(&root->subv_writers->counter);
	/*
	 * Make sure counter is updated before we check for snapshot creation.
	 */
	smp_mb();
	if (atomic_read(&root->will_be_snapshoted)) {
		btrfs_end_write_no_snapshoting(root);
		return 0;
	}
	return 1;
}

static int wait_snapshoting_atomic_t(atomic_t *a)
{
	schedule();
	return 0;
}

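/*
 * Loop until btrfs_start_write_no_snapshoting() succeeds, i.e. until any
 * snapshot creation currently in progress has finished.  The caller is left
 * holding a no-snapshoting reference and must drop it with
 * btrfs_end_write_no_snapshoting().
 */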
void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
{
	while (true) {
		int ret;

		ret = btrfs_start_write_no_snapshoting(root);
		if (ret)
			break;
		wait_on_atomic_t(&root->will_be_snapshoted,
				 wait_snapshoting_atomic_t,
				 TASK_UNINTERRUPTIBLE);
	}
}