// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);

static const struct btrfs_csums {
	u16		size;
	const char	name[10];
	const char	driver[12];
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
};

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);
	/*
	 * csum type is validated at mount time
	 */
	return btrfs_csums[t].size;
}

const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].driver[0] ?
		btrfs_csums[csum_type].driver :
		btrfs_csums[csum_type].name;
}

size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}

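/*
 * Illustrative sketch, not part of the original file: the btrfs_csums table
 * above is what makes these helpers simple array lookups. For a mounted
 * filesystem one could report the active checksum roughly like:
 *
 *	u16 t = btrfs_super_csum_type(fs_info->super_copy);
 *
 *	pr_info("checksum %s (%d bytes)\n",
 *		btrfs_super_csum_name(t),
 *		btrfs_super_csum_size(fs_info->super_copy));
 */
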
struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
	if (!p)
		return;
	btrfs_release_path(p);
	kmem_cache_free(btrfs_path_cachep, p);
}

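/*
 * Illustrative usage sketch (not part of the original file): callers pair
 * btrfs_alloc_path() with btrfs_free_path(), which drops any locks and
 * extent buffer references via btrfs_release_path() before freeing:
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *
 *	if (!path)
 *		return -ENOMEM;
 *	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 *	... examine path->nodes[0] / path->slots[0] ...
 *	btrfs_free_path(path);
 */
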
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths where no locks or extent buffers are held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
	int i;

	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
		p->slots[i] = 0;
		if (!p->nodes[i])
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
			p->locks[i] = 0;
		}
		free_extent_buffer(p->nodes[i]);
		p->nodes[i] = NULL;
	}
}

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}

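/*
 * Illustrative sketch (not part of the original file): the looping mentioned
 * above pairs btrfs_root_node() with a lock and a re-check, roughly what
 * btrfs_lock_root_node() does:
 *
 *	while (1) {
 *		eb = btrfs_root_node(root);
 *		btrfs_tree_lock(eb);
 *		if (eb == root->node)
 *			break;
 *		btrfs_tree_unlock(eb);
 *		free_extent_buffer(eb);
 *	}
 */
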
/*
 * Cowonly roots (not-shareable trees, everything not subvolume or reloc
 * roots) just get put onto a simple dirty list.  Transaction walks this
 * list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&fs_info->trans_lock);
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
				     &disk_key, level, buf->start, 0,
				     BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};

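/*
 * Illustrative example (not part of the original file): moving 3 key/ptr
 * pairs from slot 5 to slot 2 within one node is logged as a single
 * MOD_LOG_MOVE_KEYS element with slot = 5, move.dst_slot = 2 and
 * move.nr_items = 3, plus one MOD_LOG_KEY_REMOVE_WHILE_MOVING element for
 * each of slots 2-4 that the move overwrites (see
 * tree_mod_log_insert_move() below).
 */
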
/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure that elem->seq is set to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we
			 * cannot remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}

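/*
 * Illustrative usage sketch (not part of the original file): a reader that
 * needs a stable view of tree history zeroes elem->seq, takes a sequence
 * number and drops it when done, so that older log entries can be reclaimed:
 *
 *	struct seq_list elem = SEQ_LIST_INIT(elem);
 *
 *	btrfs_get_tree_mod_seq(fs_info, &elem);
 *	... walk history at sequence number elem.seq ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */
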
/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb) {
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 1;
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return 1;
	}

	return 0;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 0;
	if (eb && btrfs_header_level(eb) == 0)
		return 0;

	return 1;
}

static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we overwrite something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}

static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;

	return 0;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is that some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			ret = btrfs_inc_ref(trans, root, buf, 1);
			if (ret)
				return ret;

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				if (ret)
					return ret;
				ret = btrfs_inc_ref(trans, root, cow, 1);
				if (ret)
					return ret;
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, buf,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			if (ret)
				return ret;
			ret = btrfs_dec_ref(trans, root, buf, 1);
			if (ret)
				return ret;
		}
		btrfs_clean_tree_block(buf);
		*last_ref = 1;
	}
	return 0;
}

static struct extent_buffer *alloc_tree_block_no_bg_flush(
					  struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  u64 parent_start,
					  const struct btrfs_disk_key *disk_key,
					  int level,
					  u64 hint,
					  u64 empty_size,
					  enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *ret;

	/*
	 * If we are COWing a node/leaf from the extent, chunk, device or free
	 * space trees, make sure that we do not finish block group creation of
	 * pending block groups. We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk. Creation of a pending block
	 * group modifies the extent, chunk, device and free space trees,
	 * therefore we could deadlock with ourselves since we are holding a
	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
	 * try to COW later.
	 * For similar reasons, we also need to delay flushing pending block
	 * groups when splitting a leaf or node, from one of those trees, since
	 * we are holding a write lock on it and its parent or when inserting a
	 * new root node for one of those trees.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root ||
	    root == fs_info->free_space_root)
		trans->can_flush_pending_bgs = false;

	ret = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, disk_key, level,
				     hint, empty_size, nest);
	trans->can_flush_pending_bgs = true;

	return ret;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size,
			     enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start = 0;

	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;

	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
					   level, search_start, empty_size, nest);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			return ret;
		}
	}

	if (buf == root->node) {
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;

		atomic_inc(&cow->refs);
		ret = tree_mod_log_insert_root(root->node, cow, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			ret = tree_mod_log_free_eb(buf);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
		struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all). this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operations for the oldest root, we simply
		 * return it. this should only happen if that (old) root is at
		 * level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is not a root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_read(eb);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}

/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		old = read_tree_block(fs_info, logical, 0, level, NULL);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
		}
	} else if (old_root) {
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		btrfs_set_lock_blocking_read(eb_root);
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock_blocking(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
		level = tm->old_root.level;
	} else {
		level = btrfs_header_level(eb_root);
	}
	free_extent_buffer(eb_root);

	return level;
}

static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
	if (btrfs_is_testing(root->fs_info))
		return 0;

	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
	 *    when we create a snapshot during committing the transaction,
	 *    after we've finished copying src root, we must COW the shared
	 *    block to ensure metadata consistency.
	 */
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
		return 0;
	return 1;
}

/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't COWed more than
 * once per transaction, as long as it hasn't been written yet
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret,
		    enum btrfs_lock_nesting nest)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 search_start;
	int ret;

	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
		btrfs_err(fs_info,
			"COW'ing blocks on a fs root that's being dropped");

	if (trans->transaction != fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       fs_info->running_transaction->transid);

	if (trans->transid != fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		trans->dirty = true;
		*cow_ret = buf;
		return 0;
	}

	search_start = buf->start & ~((u64)SZ_1G - 1);

	if (parent)
		btrfs_set_lock_blocking_write(parent);
	btrfs_set_lock_blocking_write(buf);

	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also we don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0, nest);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}

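/*
 * Illustrative usage sketch (not part of the original file): a caller that
 * holds a write lock on a buffer it is about to modify re-points its
 * reference at the COW copy, e.g.:
 *
 *	ret = btrfs_cow_block(trans, root, buf, parent, slot, &buf,
 *			      BTRFS_NESTING_COW);
 *	if (ret)
 *		return ret;
 *	... buf now refers to the writable copy, locked and marked dirty ...
 */
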
/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
{
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
		return 1;
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
		return 1;
	return 0;
}

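/*
 * Worked example (illustrative, not part of the original file): with a
 * 16KiB blocksize, blocknr = 1MiB and other = 1MiB + 20KiB gives
 * other - (blocknr + blocksize) = 4KiB < 32768, so the two blocks count
 * as close and defrag skips reallocating them.
 */
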
#ifdef __LITTLE_ENDIAN

/*
 * Compare two keys, on little-endian the disk order is same as CPU order and
 * we can avoid the conversion.
 */
static int comp_keys(const struct btrfs_disk_key *disk_key,
		     const struct btrfs_key *k2)
{
	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

	return btrfs_comp_cpu_keys(k1, k2);
}

#else

/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	return btrfs_comp_cpu_keys(&k1, k2);
}
#endif

/*
 * same as comp_keys only with two btrfs_key's
 */
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}

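/*
 * Illustrative example (not part of the original file): keys compare by
 * (objectid, type, offset) in that order, so {256, BTRFS_INODE_ITEM_KEY, 0}
 * sorts before {256, BTRFS_DIR_ITEM_KEY, <hash>} purely because
 * BTRFS_INODE_ITEM_KEY < BTRFS_DIR_ITEM_KEY; offsets only break ties when
 * objectid and type are equal.
 */
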
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct extent_buffer *parent,
		       int start_slot, u64 *last_ret,
		       struct btrfs_key *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cur;
	u64 blocknr;
	u64 gen;
	u64 search_start = *last_ret;
	u64 last_block = 0;
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
	int parent_level;
	int uptodate;
	u32 blocksize;
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;

	parent_level = btrfs_header_level(parent);

	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);

	parent_nritems = btrfs_header_nritems(parent);
	blocksize = fs_info->nodesize;
	end_slot = parent_nritems - 1;

	if (parent_nritems <= 1)
		return 0;

	btrfs_set_lock_blocking_write(parent);

	for (i = start_slot; i <= end_slot; i++) {
		struct btrfs_key first_key;
		int close = 1;

		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
		blocknr = btrfs_node_blockptr(parent, i);
		gen = btrfs_node_ptr_generation(parent, i);
		btrfs_node_key_to_cpu(parent, &first_key, i);
		if (last_block == 0)
			last_block = blocknr;

		if (i > 0) {
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (!close && i < end_slot) {
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
		}
		if (close) {
			last_block = blocknr;
			continue;
		}

		cur = find_extent_buffer(fs_info, blocknr);
		if (cur)
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
		else
			uptodate = 0;
		if (!cur || !uptodate) {
			if (!cur) {
				cur = read_tree_block(fs_info, blocknr, gen,
						      parent_level - 1,
						      &first_key);
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
					free_extent_buffer(cur);
					return -EIO;
				}
			} else if (!uptodate) {
				err = btrfs_read_buffer(cur, gen,
						parent_level - 1, &first_key);
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
			}
		}
		if (search_start == 0)
			search_start = last_block;

		btrfs_tree_lock(cur);
		btrfs_set_lock_blocking_write(cur);
		err = __btrfs_cow_block(trans, root, cur, parent, i,
					&cur, search_start,
					min(16 * blocksize,
					    (end_slot - i) * blocksize),
					BTRFS_NESTING_COW);
		if (err) {
			btrfs_tree_unlock(cur);
			free_extent_buffer(cur);
			break;
		}
		search_start = cur->start;
		last_block = cur->start;
		*last_ret = search_start;
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
	}
	return err;
}

/*
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p, int item_size,
				       const struct btrfs_key *key,
				       int max, int *slot)
{
	int low = 0;
	int high = max;
	int ret;
	const int key_size = sizeof(struct btrfs_disk_key);

	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

	while (low < high) {
		unsigned long oip;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

		mid = (low + high) / 2;
		offset = p + mid * item_size;
		oip = offset_in_page(offset);

		if (oip + key_size <= PAGE_SIZE) {
			const unsigned long idx = offset >> PAGE_SHIFT;
			char *kaddr = page_address(eb->pages[idx]);

			tmp = (struct btrfs_disk_key *)(kaddr + oip);
		} else {
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
		}

		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

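/*
 * Worked example (illustrative, not part of the original file): searching a
 * node whose keys have objectids {256, 258, 260, 262} for objectid 260
 * returns 0 with *slot = 2; searching for 259 returns 1 with *slot = 2,
 * the position where the key would be inserted.
 */
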
/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
		     int *slot)
{
	if (btrfs_header_level(eb) == 0)
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
					  sizeof(struct btrfs_item),
					  key, btrfs_header_nritems(eb),
					  slot);
	else
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
					  sizeof(struct btrfs_key_ptr),
					  key, btrfs_header_nritems(eb),
					  slot);
}

static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

/* given a node and slot number, this reads the block it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 */
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
{
	int level = btrfs_header_level(parent);
	struct extent_buffer *eb;
	struct btrfs_key first_key;

	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);

	BUG_ON(level == 0);

	btrfs_node_key_to_cpu(parent, &first_key, slot);
	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
			     btrfs_node_ptr_generation(parent, slot),
			     level - 1, &first_key);
	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		eb = ERR_PTR(-EIO);
	}

	return eb;
}

/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave a node completely empty later on.
 */
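/*
 * In outline: if the root holds a single pointer, its child is promoted to
 * become the new root; otherwise pointers are pushed into the left and
 * right siblings, empty nodes are deleted and parent keys are fixed up.
 */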
static noinline int balance_level(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	ASSERT(level > 0);

	mid = path->nodes[level];

	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
	if (!parent) {
		struct extent_buffer *child;

		if (btrfs_header_nritems(mid) != 1)
			return 0;

		/* promote the child to a root */
		child = btrfs_read_node_slot(mid, 0);
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}

		btrfs_tree_lock(child);
		btrfs_set_lock_blocking_write(child);
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
				      BTRFS_NESTING_COW);
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}

		ret = tree_mod_log_insert_root(root->node, child, 1);
		BUG_ON(ret < 0);
		rcu_assign_pointer(root->node, child);

		add_root_to_dirty_list(root);
		btrfs_tree_unlock(child);

		path->locks[level] = 0;
		path->nodes[level] = NULL;
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		/* once for the path */
		free_extent_buffer(mid);

		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		/* once for the root ptr */
		free_extent_buffer_stale(mid);
		return 0;
	}
	if (btrfs_header_nritems(mid) >
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
		return 0;

	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	if (left) {
		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		btrfs_set_lock_blocking_write(left);
		wret = btrfs_cow_block(trans, root, left,
				       parent, pslot - 1, &left,
				       BTRFS_NESTING_LEFT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	if (right) {
		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		btrfs_set_lock_blocking_write(right);
		wret = btrfs_cow_block(trans, root, right,
				       parent, pslot + 1, &right,
				       BTRFS_NESTING_RIGHT_COW);
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
	if (left) {
		orig_slot += btrfs_header_nritems(left);
		wret = push_node_left(trans, left, mid, 1);
		if (wret < 0)
			ret = wret;
	}

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		wret = push_node_left(trans, mid, right, 1);
		if (wret < 0 && wret != -ENOSPC)
			ret = wret;
		if (btrfs_header_nritems(right) == 0) {
			btrfs_clean_tree_block(right);
			btrfs_tree_unlock(right);
			del_ptr(root, path, level + 1, pslot + 1);
			root_sub_used(root, right->len);
			btrfs_free_tree_block(trans, root, right, 0, 1);
			free_extent_buffer_stale(right);
			right = NULL;
		} else {
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
		}
	}
	if (btrfs_header_nritems(mid) == 1) {
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
		if (!left) {
			ret = -EROFS;
			btrfs_handle_fs_error(fs_info, ret, NULL);
			goto enospc;
		}
		wret = balance_node_right(trans, mid, left);
		if (wret < 0) {
			ret = wret;
			goto enospc;
		}
		if (wret == 1) {
			wret = push_node_left(trans, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
		BUG_ON(wret == 1);
	}
	if (btrfs_header_nritems(mid) == 0) {
		btrfs_clean_tree_block(mid);
		btrfs_tree_unlock(mid);
		del_ptr(root, path, level + 1, pslot);
		root_sub_used(root, mid->len);
		btrfs_free_tree_block(trans, root, mid, 0, 1);
		free_extent_buffer_stale(mid);
		mid = NULL;
	} else {
		/* update the parent key to reflect our changes */
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		ret = tree_mod_log_insert_key(parent, pslot,
				MOD_LOG_KEY_REPLACE, GFP_NOFS);
		BUG_ON(ret < 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
	}

	/* update the path */
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			atomic_inc(&left->refs);
			/* left was locked after cow */
			path->nodes[level] = left;
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
			if (mid) {
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			}
		} else {
			orig_slot -= btrfs_header_nritems(left);
			path->slots[level] = orig_slot;
		}
	}
	/* double check we haven't messed things up */
	if (orig_ptr !=
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
		BUG();
enospc:
	if (right) {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	return ret;
}

/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
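/*
 * Returns 0 if a push succeeded and the path was updated to point into
 * the node that now holds the target slot; returns 1 if nothing could be
 * pushed and the caller has to split the node instead.
 */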
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	if (!parent)
		return 1;

	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
		btrfs_set_lock_blocking_write(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left,
					      BTRFS_NESTING_LEFT_COW);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
		btrfs_set_lock_blocking_write(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right, BTRFS_NESTING_RIGHT_COW);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 1;
}

/*
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
 */
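/*
 * The readahead below is best effort: only sibling blocks within 64KiB of
 * the target are queued, and a single call stops after 64KiB worth of
 * blocks or 32 scanned slots, whichever comes first.
 */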
static void reada_for_search(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
{
	struct extent_buffer *node;
	struct btrfs_disk_key disk_key;
	u32 nritems;
	u64 search;
	u64 target;
	u64 nread = 0;
	struct extent_buffer *eb;
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;

	if (level != 1)
		return;

	if (!path->nodes[level])
		return;

	node = path->nodes[level];

	search = btrfs_node_blockptr(node, slot);
	blocksize = fs_info->nodesize;
	eb = find_extent_buffer(fs_info, search);
	if (eb) {
		free_extent_buffer(eb);
		return;
	}

	target = search;

	nritems = btrfs_header_nritems(node);
	nr = slot;

	while (1) {
		if (path->reada == READA_BACK) {
			if (nr == 0)
				break;
			nr--;
		} else if (path->reada == READA_FORWARD) {
			nr++;
			if (nr >= nritems)
				break;
		}
		if (path->reada == READA_BACK && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
		search = btrfs_node_blockptr(node, nr);
		if ((search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
			readahead_tree_block(fs_info, search);
			nread += blocksize;
		}
		nscan++;
		if ((nread > 65536 || nscan > 32))
			break;
	}
}
static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
				       struct btrfs_path *path, int level)
{
	int slot;
	int nritems;
	struct extent_buffer *parent;
	struct extent_buffer *eb;
	u64 gen;
	u64 block1 = 0;
	u64 block2 = 0;

	parent = path->nodes[level + 1];
	if (!parent)
		return;

	nritems = btrfs_header_nritems(parent);
	slot = path->slots[level + 1];

	if (slot > 0) {
		block1 = btrfs_node_blockptr(parent, slot - 1);
		gen = btrfs_node_ptr_generation(parent, slot - 1);
		eb = find_extent_buffer(fs_info, block1);
		/*
		 * if we get -EAGAIN from btrfs_buffer_uptodate, we
		 * don't want to return EAGAIN here.  That will loop
		 * forever
		 */
		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
			block1 = 0;
		free_extent_buffer(eb);
	}
	if (slot + 1 < nritems) {
		block2 = btrfs_node_blockptr(parent, slot + 1);
		gen = btrfs_node_ptr_generation(parent, slot + 1);
		eb = find_extent_buffer(fs_info, block2);
		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
			block2 = 0;
		free_extent_buffer(eb);
	}

	if (block1)
		readahead_tree_block(fs_info, block1);
	if (block2)
		readahead_tree_block(fs_info, block2);
}


/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
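/*
 * Put differently: whenever the path passes through slot 0 at some level,
 * the parent above it stays locked, because a key change at that level
 * would have to be propagated upwards; with keep_locks set the same holds
 * for the last slot of a block.
 */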
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;
	int no_skips = 0;
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}

/*
 * helper function for btrfs_search_slot.  The goal is to find a block
 * in cache without setting the path to blocking.  If we find the block
 * we return zero and the path is unchanged.
 *
 * If we can't find the block, we set the path blocking and do some
 * reada.  -EAGAIN is returned and the search must be repeated.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 blocknr;
	u64 gen;
	struct extent_buffer *tmp;
	struct btrfs_key first_key;
	int ret;
	int parent_level;

	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	gen = btrfs_node_ptr_generation(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);

	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		/* the pages were up to date, but we failed
		 * the generation number check.  Do a full
		 * read for the generation number that is correct.
		 * We must do this without dropping locks so
		 * we can trust our generation number
		 */
		btrfs_set_path_blocking(p);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
		if (!ret) {
			*eb_ret = tmp;
			return 0;
		}
		free_extent_buffer(tmp);
		btrfs_release_path(p);
		return -EIO;
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	ret = -EAGAIN;
	tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
			      &first_key);
	if (!IS_ERR(tmp)) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!extent_buffer_uptodate(tmp))
			ret = -EIO;
		free_extent_buffer(tmp);
	} else {
		ret = PTR_ERR(tmp);
	}

	btrfs_release_path(p);
	return ret;
}

/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
		int sret;

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(fs_info, p, level);
		sret = split_node(trans, root, p, level);

		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
		int sret;

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(fs_info, p, level);
		sret = balance_level(trans, root, p, level);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		if (!b) {
			btrfs_release_path(p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}

int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		u64 iobjectid, u64 ioff, u8 key_type,
		struct btrfs_key *found_key)
{
	int ret;
	struct btrfs_key key;
	struct extent_buffer *eb;

	ASSERT(path);
	ASSERT(found_key);

	key.type = key_type;
	key.objectid = iobjectid;
	key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	eb = path->nodes[0];
	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		eb = path->nodes[0];
	}

	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
	if (found_key->type != key.type ||
			found_key->objectid != key.objectid)
		return 1;

	return 0;
}

static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int root_lock;
	int level = 0;

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	if (p->search_commit_root) {
		/*
		 * The commit roots are read only so we always do read locks,
		 * and we always must hold the commit_root_sem when doing
		 * searches on them, the only exception is send where we don't
		 * want to block transaction commits for a long time, so
		 * we need to clone the commit root in order to avoid races
		 * with transaction commits that create a snapshot of one of
		 * the roots used by a send operation.
		 */
		if (p->need_commit_sem) {
			down_read(&fs_info->commit_root_sem);
			b = btrfs_clone_extent_buffer(root->commit_root);
			up_read(&fs_info->commit_root_sem);
			if (!b)
				return ERR_PTR(-ENOMEM);

		} else {
			b = root->commit_root;
			atomic_inc(&b->refs);
		}
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		b = __btrfs_read_lock_root_node(root, p->recurse);
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}


/*
 * btrfs_search_slot - look for a key in a tree and perform necessary
 * modifications to preserve tree invariants.
 *
 * @trans:	Handle of transaction, used when modifying the tree
 * @p:		Holds all btree nodes along the search path
 * @root:	The root node of the tree
 * @key:	The key we are looking for
 * @ins_len:	Indicates purpose of search, for inserts it is 1, for
 *		deletions it's -1. 0 for plain searches
 * @cow:	boolean should CoW operations be performed. Must always be 1
 *		when modifying the tree.
 *
 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
 *
 * If @key is found, 0 is returned and you can find the item in the leaf level
 * of the path (level 0)
 *
 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
 * points to the slot where it should be inserted
 *
 * If an error is encountered while searching the tree a negative error number
 * is returned
 */
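/*
 * Illustrative usage (not from the original source; error handling is
 * trimmed and 'ino' is a placeholder).  A plain read-only lookup takes a
 * NULL transaction handle and ins_len == cow == 0:
 *
 *	struct btrfs_path *path = btrfs_alloc_path();
 *	struct btrfs_key key;
 *	int ret;
 *
 *	key.objectid = ino;
 *	key.type = BTRFS_INODE_ITEM_KEY;
 *	key.offset = 0;
 *	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *	if (ret == 0)
 *		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 *	btrfs_free_path(path);
 */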
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers.  Make sure we keep write
		 * locks for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);
	if (IS_ERR(b)) {
		ret = PTR_ERR(b);
		goto done;
	}

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);

		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b)) {
				trans->dirty = true;
				goto cow_done;
			}

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			btrfs_set_path_blocking(p);
			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b,
						      BTRFS_NESTING_COW);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b,
						      BTRFS_NESTING_COW);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		/*
		 * Leave path with blocking locks to avoid massive
		 * lock context switch, this is done on purpose.
		 */

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		/*
		 * If btrfs_bin_search returns an exact match (prev_cmp == 0)
		 * we can safely assume the target key will always be in slot 0
		 * on lower levels due to the invariants BTRFS' btree provides,
		 * namely that a btrfs_key_ptr entry always points to the
		 * lowest key in the child node, thus we can skip searching
		 * lower levels
		 */
		if (prev_cmp == 0) {
			slot = 0;
			ret = 0;
		} else {
			ret = btrfs_bin_search(b, key, &slot);
			prev_cmp = ret;
			if (ret < 0)
				goto done;
		}

		if (level == 0) {
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, NULL);
			goto done;
		}
		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
					     &write_lock_level);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}
		b = p->nodes[level];
		slot = p->slots[level];

		/*
		 * Slot 0 is special, if we change the key we have to update
		 * the parent pointer which means we must have a write lock on
		 * the parent
		 */
		if (slot == 0 && ins_len && write_lock_level < level + 1) {
			write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		unlock_up(p, level, lowest_unlock, min_write_lock_level,
			  &write_lock_level);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		if (!p->skip_locking) {
			level = btrfs_header_level(b);
			if (level <= write_lock_level) {
				if (!btrfs_try_tree_write_lock(b)) {
					btrfs_set_path_blocking(p);
					btrfs_tree_lock(b);
				}
				p->locks[level] = BTRFS_WRITE_LOCK;
			} else {
				if (!btrfs_tree_read_lock_atomic(b)) {
					btrfs_set_path_blocking(p);
					__btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL,
							       p->recurse);
				}
				p->locks[level] = BTRFS_READ_LOCK;
			}
			p->nodes[level] = b;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
}

/*
 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
 * current state of the tree together with the operations recorded in the tree
 * modification log to search for the key in a previous version of this tree, as
 * denoted by the time_seq parameter.
 *
 * Naturally, there is no support for insert, delete or cow operations.
 *
 * The resulting path and return value will be set up as if we called
 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
 */
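/*
 * Callers typically obtain time_seq from the tree mod log sequence
 * numbers (e.g. the backref walking code does this) so the tree can be
 * viewed as it was at that point.
 */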
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);
		p->nodes[level] = b;

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		ret = btrfs_bin_search(b, key, &slot);
		if (ret < 0)
			goto done;

		if (level == 0) {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}

		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		unlock_up(p, level, lowest_unlock, 0, NULL);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		level = btrfs_header_level(b);
		if (!btrfs_tree_read_lock_atomic(b)) {
			btrfs_set_path_blocking(p);
			btrfs_tree_read_lock(b);
		}
		b = tree_mod_log_rewind(fs_info, p, b, time_seq);
		if (!b) {
			ret = -ENOMEM;
			goto done;
		}
		p->locks[level] = BTRFS_READ_LOCK;
		p->nodes[level] = b;
	}
	ret = 1;
done:
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}

/*
 * helper to use instead of search slot if no exact match is needed but
 * instead the next or previous item should be returned.
 * When find_higher is true, the next higher item is returned, the next lower
 * otherwise.
 * When return_any and find_higher are both true, and no higher item is found,
 * return the next lower instead.
 * When return_any is true and find_higher is false, and no lower item is found,
 * return the next higher instead.
 * It returns 0 if any item is found, 1 if none is found (tree empty), and
 * < 0 on error
 */
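/*
 * Illustrative example (not from the original source): with keys 5 and 9
 * in the tree, searching for 7 with find_higher == 1 positions the path
 * at 9, while find_higher == 0 positions it at 5.
 */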
int btrfs_search_slot_for_read(struct btrfs_root *root,
			       const struct btrfs_key *key,
			       struct btrfs_path *p, int find_higher,
			       int return_any)
{
	int ret;
	struct extent_buffer *leaf;

again:
	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
	if (ret <= 0)
		return ret;
	/*
	 * a return value of 1 means the path is at the position where the
	 * item should be inserted. Normally this is the next bigger item,
	 * but in case the previous item is the last in a leaf, path points
	 * to the first free slot in the previous leaf, i.e. at an invalid
	 * item.
	 */
	leaf = p->nodes[0];

	if (find_higher) {
		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, p);
			if (ret <= 0)
				return ret;
			if (!return_any)
				return 1;
			/*
			 * no higher item found, return the next
			 * lower instead
			 */
			return_any = 0;
			find_higher = 0;
			btrfs_release_path(p);
			goto again;
		}
	} else {
		if (p->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, p);
			if (ret < 0)
				return ret;
			if (!ret) {
				leaf = p->nodes[0];
				if (p->slots[0] == btrfs_header_nritems(leaf))
					p->slots[0]--;
				return 0;
			}
			if (!return_any)
				return 1;
			/*
			 * no lower item found, return the next
			 * higher instead
			 */
			return_any = 0;
			find_higher = 1;
			btrfs_release_path(p);
			goto again;
		} else {
			--p->slots[0];
		}
	}
	return 0;
}

/*
 * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node points to 'key'.
 * This is used after shifting pointers to the left, so it stops
 * fixing up pointers when a given leaf/node is not in slot 0 of the
 * higher levels
 */
static void fixup_low_keys(struct btrfs_path *path,
			   struct btrfs_disk_key *key, int level)
{
	int i;
	struct extent_buffer *t;
	int ret;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		int tslot = path->slots[i];

		if (!path->nodes[i])
			break;
		t = path->nodes[i];
		ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
				GFP_ATOMIC);
		BUG_ON(ret < 0);
		btrfs_set_node_key(t, key, tslot);
		btrfs_mark_buffer_dirty(path->nodes[i]);
		if (tslot != 0)
			break;
	}
}

/*
 * update item key.
 *
 * This function isn't completely safe. It's the caller's responsibility
 * that the new key won't break the order
 */
void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     const struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
	}
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	if (slot == 0)
		fixup_low_keys(path, &disk_key, 1);
}

/*
 * Check key order of two sibling extent buffers.
 *
 * Return true if something is wrong.
 * Return false if everything is fine.
 *
 * Tree-checker only works inside one tree block, thus the following
 * corruption can not be detected by tree-checker:
 *
 * Leaf @left			| Leaf @right
 * --------------------------------------------------------------
 * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
 *
 * Key f6 in leaf @left itself is valid, but not valid when the next
 * key in leaf @right is 7.
 * This can only be checked at tree block merge time.
 * And since tree checker has ensured all key order in each tree block
 * is correct, we only need to bother the last key of @left and the first
 * key of @right.
 */
static bool check_sibling_keys(struct extent_buffer *left,
			       struct extent_buffer *right)
{
	struct btrfs_key left_last;
	struct btrfs_key right_first;
	int level = btrfs_header_level(left);
	int nr_left = btrfs_header_nritems(left);
	int nr_right = btrfs_header_nritems(right);

	/* No key to check in one of the tree blocks */
	if (!nr_left || !nr_right)
		return false;

	if (level) {
		btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
		btrfs_node_key_to_cpu(right, &right_first, 0);
	} else {
		btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
		btrfs_item_key_to_cpu(right, &right_first, 0);
	}

	if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
		btrfs_crit(left->fs_info,
"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
			   left_last.objectid, left_last.type,
			   left_last.offset, right_first.objectid,
			   right_first.type, right_first.offset);
		return true;
	}
	return false;
}

/*
 * try to push data from one node into the next node left in the
 * tree.
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
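/*
 * Worked example (illustrative numbers): if dst holds 10 pointers, src
 * holds 40 and a node fits 100, a non-empty push moves
 * min(40 - 8, 100 - 10) = 32 pointers, leaving the required 8 in src.
 */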
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* dst is the left eb, src is the middle eb */
	if (check_sibling_keys(dst, src)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * Don't call tree_mod_log_insert_move here, key removal was
		 * already fully logged by tree_mod_log_eb_copy above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}

/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
3320
{
3321
	struct btrfs_fs_info *fs_info = trans->fs_info;
3322 3323 3324 3325 3326 3327
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

3328 3329 3330
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

3331 3332
	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
3333
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
C
Chris Mason 已提交
3334
	if (push_items <= 0)
3335
		return 1;
3336

C
Chris Mason 已提交
3337
	if (src_nritems < 4)
3338
		return 1;
3339 3340 3341

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
C
Chris Mason 已提交
3342
	if (max_push >= src_nritems)
3343
		return 1;
Y
Yan 已提交
3344

3345 3346 3347
	if (max_push < push_items)
		push_items = max_push;

3348 3349 3350 3351 3352 3353
	/* dst is the right eb, src is the middle eb */
	if (check_sibling_keys(src, dst)) {
		ret = -EUCLEAN;
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
3354 3355
	ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
	BUG_ON(ret < 0);
3356 3357 3358 3359
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));
C
Chris Mason 已提交
3360

3361 3362
	ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
				   push_items);
3363
	if (ret) {
3364
		btrfs_abort_transaction(trans, ret);
3365 3366
		return ret;
	}
3367 3368 3369
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
C
Chris Mason 已提交
3370
			   push_items * sizeof(struct btrfs_key_ptr));
3371

3372 3373
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3374

3375 3376
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);
Z
Zheng Yan 已提交
3377

C
Chris Mason 已提交
3378
	return ret;
3379 3380
}

/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;
	int ret;

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
					 root->node->start, 0,
					 BTRFS_NESTING_NEW_ROOT);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, fs_info->nodesize);

	btrfs_set_header_nritems(c, 1);
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	old = root->node;
	ret = tree_mod_log_insert_root(root->node, c, 0);
	BUG_ON(ret < 0);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	atomic_inc(&c->refs);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	path->slots[level] = 0;
	return 0;
}

/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
 *
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
 */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
	if (slot != nritems) {
		if (level) {
			ret = tree_mod_log_insert_move(lower, slot + 1, slot,
					nritems - slot);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
				GFP_NOFS);
		BUG_ON(ret < 0);
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);

/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
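/*
 * Illustrative split (not from the original source): with c_nritems == 9,
 * mid == (9 + 1) / 2 == 5, so slots [5, 8] move to the new right sibling
 * and the original node keeps slots [0, 4].
 */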
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, let's make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with tree_mod_log_eb_copy. We're holding a
		 * tree lock on the buffer, which is why we cannot race with
		 * other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
					     c->start, 0, BTRFS_NESTING_SPLIT);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, fs_info->nodesize);
	ASSERT(btrfs_header_level(c) == level);

	ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	insert_ptr(trans, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}

/*
 * how many bytes are required to store the items in a leaf.  start
 * and nr indicate which items in the leaf to check.  This totals up the
 * space used both by the item structs and the item data
 */
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
{
	struct btrfs_item *start_item;
	struct btrfs_item *end_item;
	int data_len;
	int nritems = btrfs_header_nritems(l);
	int end = min(nritems, start + nr) - 1;

	if (!nr)
		return 0;
	start_item = btrfs_item_nr(start);
	end_item = btrfs_item_nr(end);
	data_len = btrfs_item_offset(l, start_item) +
		   btrfs_item_size(l, start_item);
	data_len = data_len - btrfs_item_offset(l, end_item);
	data_len += sizeof(struct btrfs_item) * nr;
	WARN_ON(data_len < 0);
	return data_len;
}

/*
 * The space between the end of the leaf items and
 * the start of the leaf data.  IOW, how much room
 * the leaf has left for both items and data
 */
noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
{
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	int nritems = btrfs_header_nritems(leaf);
	int ret;

	ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
	if (ret < 0) {
		btrfs_crit(fs_info,
			   "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
			   ret,
			   (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
			   leaf_space_used(leaf, 0, nritems), nritems);
	}
	return ret;
}
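
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the check callers make with btrfs_leaf_free_space() before deciding that
 * inserting 'data_size' payload bytes fits without a split.  Every new item
 * costs its payload plus one struct btrfs_item header.  The helper name is
 * hypothetical.
 */
static inline bool example_leaf_has_room(struct extent_buffer *leaf,
					 u32 data_size)
{
	return btrfs_leaf_free_space(leaf) >=
	       (int)(data_size + sizeof(struct btrfs_item));
}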

/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 */
static noinline int __push_leaf_right(struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct btrfs_fs_info *fs_info = right->fs_info;
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	i = left_nritems - 1;
	while (i >= nr) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(left);

				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(left);

	/* make room in the right data area */
	data_end = leaf_data_end(right);
	memmove_extent_buffer(right,
			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
		     push_space);

	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	btrfs_init_map_token(&token, right);
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);
		push_space -= btrfs_token_item_size(&token, item);
		btrfs_set_token_item_offset(&token, item, push_space);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		btrfs_clean_tree_block(left);

	btrfs_mark_buffer_dirty(right);

	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			btrfs_clean_tree_block(path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}

/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * this will push starting from min_slot to the end of the leaf.  It won't
 * push any slot lower than min_slot
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	if (!path->nodes[1])
		return 1;

	slot = path->slots[1];
	upper = path->nodes[1];
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	right = btrfs_read_node_slot(upper, slot + 1);
	/*
	 * slot + 1 is not valid or we fail to read the right node,
	 * no big deal, just return.
	 */
	if (IS_ERR(right))
		return 1;

	__btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
	btrfs_set_lock_blocking_write(right);

	free_space = btrfs_leaf_free_space(right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
	if (ret)
		goto out_unlock;

	free_space = btrfs_leaf_free_space(right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	if (check_sibling_keys(left, right)) {
		ret = -EUCLEAN;
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
		return ret;
	}
	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaf. */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	return __push_leaf_right(path, min_data_size, empty,
				right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}

/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
 * items
 */
static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_fs_info *fs_info = left->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(right);

				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
		     btrfs_item_offset_nr(right, push_items - 1);

	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(left) - push_space,
		     BTRFS_LEAF_DATA_OFFSET +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	btrfs_init_map_token(&token, left);
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(&token, item);
		btrfs_set_token_item_offset(&token, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);

	if (push_items < right_nritems) {
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(right);
		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
				      BTRFS_LEAF_DATA_OFFSET +
				      leaf_data_end(right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}

	btrfs_init_map_token(&token, right);
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);

		push_space = push_space - btrfs_token_item_size(&token, item);
		btrfs_set_token_item_offset(&token, item, push_space);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		btrfs_clean_tree_block(right);

	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}

/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
 * items
 */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	slot = path->slots[1];
	if (slot == 0)
		return 1;
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
	/*
	 * slot - 1 is not valid or we fail to read the left node,
	 * no big deal, just return.
	 */
	if (IS_ERR(left))
		return 1;

	__btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
	btrfs_set_lock_blocking_write(left);

	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left,
			      BTRFS_NESTING_LEFT_COW);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	if (check_sibling_keys(left, right)) {
		ret = -EUCLEAN;
		goto out;
	}
	return __push_leaf_left(path, min_data_size,
			       empty, left, free_space, right_nritems,
			       max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}

/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);

	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	copy_extent_buffer(right, l,
		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(l), data_copy_size);

	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);

	btrfs_init_map_token(&token, right);
	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(i);
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, item);
		btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
	}

	btrfs_set_header_nritems(l, mid);
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}

/*
 * double splits happen when we need to insert a big item in the middle
 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
 *          A                 B                 C
 *
 * We avoid this by trying to push the items on either side of our target
 * into the adjacent leaves.  If all goes well we can avoid the double split
 * completely.
 */
static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  int data_size)
{
	int ret;
	int progress = 0;
	int slot;
	u32 nritems;
	int space_needed = data_size;

	slot = path->slots[0];
	if (slot < btrfs_header_nritems(path->nodes[0]))
		space_needed -= btrfs_leaf_free_space(path->nodes[0]);

	/*
	 * try to push all the items after our slot into the
	 * right leaf
	 */
	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
	if (ret < 0)
		return ret;

	if (ret == 0)
		progress++;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * our goal is to get our slot at the start or end of a leaf.  If
	 * we've done so we're done
	 */
	if (path->slots[0] == 0 || path->slots[0] == nritems)
		return 0;

	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
		return 0;

	/* try to push all the items before our slot into the next leaf */
	slot = path->slots[0];
	space_needed = data_size;
	if (slot > 0)
		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
	if (ret < 0)
		return ret;

	if (ret == 0)
		progress++;

	if (progress)
		return 0;
	return 1;
}

/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
 * returns 0 if all went well and < 0 on failure.
 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	if (extend && data_size + btrfs_item_size_nr(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			space_needed = data_size;
			if (slot > 0)
				space_needed -= btrfs_leaf_free_space(l);
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(l) >= data_size)
			return 0;
	}

	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (slot >= nritems) {
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (!extend && data_size && slot == 0) {
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	/*
	 * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
	 * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
	 * subclasses, which is 8 at the time of this patch, and we've maxed it
	 * out.  In the future we could add a
	 * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
	 * use BTRFS_NESTING_NEW_ROOT.
	 */
	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
					     l->start, 0, num_doubles ?
					     BTRFS_NESTING_NEW_ROOT :
					     BTRFS_NESTING_SPLIT);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, fs_info->nodesize);

	if (split == 0) {
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(path, &disk_key, 1);
		}
		/*
		 * We create a new leaf 'right' for the required ins_len and
		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
		 * the content of ins_len to 'right'.
		 */
		return ret;
	}

	copy_for_split(trans, path, l, right, slot, mid, nritems);

	if (split == 2) {
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
		return 0;
	goto again;
}

static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	if (btrfs_leaf_free_space(leaf) >= ins_len)
		return 0;

	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
		goto err;

	/* the leaf has changed, it now has room.  return now */
	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	btrfs_set_path_blocking(path);
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}

static noinline int split_item(struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
	kfree(buf);
	return 0;
}

/*
 * This function splits a single item into two items,
 * giving 'new_key' to the new item and splitting the
 * old one at split_offset (from the start of the item).
 *
 * The path may be released by this operation.  After
 * the split, the path is pointing to the old item.  The
 * new item is going to be in the same node as the old one.
 *
 * Note, the item being split must be small enough to live alone on
 * a tree block with room for one extra struct btrfs_item
 *
 * This allows us to split the item in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_split_item(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_path *path,
		     const struct btrfs_key *new_key,
		     unsigned long split_offset)
{
	int ret;
	ret = setup_leaf_for_split(trans, root, path,
				   sizeof(struct btrfs_item));
	if (ret)
		return ret;

	ret = split_item(path, new_key, split_offset);
	return ret;
}
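
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * typical use of btrfs_split_item().  The caller positions 'path' on an
 * EXTENT_DATA or EXTENT_CSUM item (the only types setup_leaf_for_split()
 * accepts), picks a key for the tail half, and cuts the item body at
 * 'split_offset'.  The helper name and key arithmetic are hypothetical.
 */
static inline int example_cut_item_tail(struct btrfs_trans_handle *trans,
					struct btrfs_root *root,
					struct btrfs_path *path,
					u64 new_offset,
					unsigned long split_offset)
{
	struct btrfs_key new_key;

	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
	new_key.offset = new_offset;	/* key for the second half */

	/* on success the path still points at the shortened first half */
	return btrfs_split_item(trans, root, path, &new_key, split_offset);
}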

/*
 * This function duplicates an item, giving 'new_key' to the new item.
 * It guarantees both items live in the same tree leaf and the new item
 * is contiguous with the original item.
 *
 * This allows us to split a file extent in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path,
			 const struct btrfs_key *new_key)
{
	struct extent_buffer *leaf;
	int ret;
	u32 item_size;

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	ret = setup_leaf_for_split(trans, root, path,
				   item_size + sizeof(struct btrfs_item));
	if (ret)
		return ret;

	path->slots[0]++;
	setup_items_for_insert(root, path, new_key, &item_size, 1);
	leaf = path->nodes[0];
	memcpy_extent_buffer(leaf,
			     btrfs_item_ptr_offset(leaf, path->slots[0]),
			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
			     item_size);
	return 0;
}
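
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the pattern used when splitting a file extent.  After
 * btrfs_duplicate_item() the path points at the new copy, and the two
 * items can be adjusted independently.  Using 'split_pos' as the new key
 * offset is an assumption for illustration only.
 */
static inline int example_dup_file_extent(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  u64 split_pos)
{
	struct btrfs_key new_key;

	btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
	new_key.offset = split_pos;	/* new file offset for the copy */

	return btrfs_duplicate_item(trans, root, path, &new_key);
}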

/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(&token, item);
		btrfs_set_token_item_offset(&token, item, ioff + size_diff);
	}

	/* shift the data */
	if (from_end) {
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start - data_end);

		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(path, &disk_key, 1);
	}

	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
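
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * shrinking an item's payload in place.  from_end = 1 chops bytes off the
 * tail, which is what callers do when a variable-sized item (csums, dir
 * items) loses data.  The helper name is hypothetical.
 */
static inline void example_shrink_item_tail(struct btrfs_path *path,
					    u32 new_size)
{
	u32 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);

	if (new_size < old_size)
		btrfs_truncate_item(path, new_size, 1);
}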

/*
 * make the item pointed to by the path bigger, data_size is the added size.
 */
void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	if (btrfs_leaf_free_space(leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_end_nr(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG();
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(&token, item);
		btrfs_set_token_item_offset(&token, item, ioff - data_size);
	}

	/* shift the data */
	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
		      data_end, old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}
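
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * growing an item's payload by 'grow_by' bytes before appending to it.
 * The search that set up 'path' must have reserved enough ins_len, since
 * btrfs_extend_item() BUGs when the leaf lacks the room.
 */
static inline void example_grow_item(struct btrfs_path *path, u32 grow_by)
{
	/* the caller guarantees the room; this check just documents the rule */
	ASSERT(btrfs_leaf_free_space(path->nodes[0]) >= (int)grow_by);
	btrfs_extend_item(path, grow_by);
}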

/**
 * setup_items_for_insert - Helper called before inserting one or more items
 * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
 * in a function that doesn't call btrfs_search_slot
 *
 * @root:	root we are inserting items to
 * @path:	points to the leaf/slot where we are going to insert new items
 * @cpu_key:	array of keys for items to be inserted
 * @data_size:	size of the body of each item we are going to insert
 * @nr:		size of @cpu_key/@data_size arrays
 */
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
			    const struct btrfs_key *cpu_key, u32 *data_size,
			    int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_item *item;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;
	u32 total_size;
	u32 total_data = 0;

	for (i = 0; i < nr; i++)
		total_data += data_size[i];
	total_size = total_data + (nr * sizeof(struct btrfs_item));

	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		fixup_low_keys(path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	if (btrfs_leaf_free_space(leaf) < total_size) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "not enough freespace need %u have %d",
			   total_size, btrfs_leaf_free_space(leaf));
		BUG();
	}

	btrfs_init_map_token(&token, leaf);
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(leaf);
			btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
				   slot, old_data, data_end);
			BUG();
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(&token, item);
			btrfs_set_token_item_offset(&token, item,
						    ioff - total_data);
		}
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(slot + i);
		data_end -= data_size[i];
		btrfs_set_token_item_offset(&token, item, data_end);
		btrfs_set_token_item_size(&token, item, data_size[i]);
	}

	btrfs_set_header_nritems(leaf, nritems + nr);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}

/*
 * Given a key and some data, insert items into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_path *path,
			    const struct btrfs_key *cpu_key, u32 *data_size,
			    int nr)
{
	int ret = 0;
	int slot;
	int i;
	u32 total_size = 0;
	u32 total_data = 0;

	for (i = 0; i < nr; i++)
		total_data += data_size[i];

	total_size = total_data + (nr * sizeof(struct btrfs_item));
	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
	if (ret == 0)
		return -EEXIST;
	if (ret < 0)
		return ret;

	slot = path->slots[0];
	BUG_ON(slot < 0);

	setup_items_for_insert(root, path, cpu_key, data_size, nr);
	return 0;
}
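
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * batch-inserting two empty items with consecutive keys, then filling in
 * their payloads.  The helper name, keys and sizes are hypothetical; real
 * callers build these arrays from the items they batched up.
 */
static inline int example_insert_two(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path,
				     struct btrfs_key *keys, /* two keys */
				     void *bodies[2], u32 sizes[2])
{
	int i;
	int ret;

	ret = btrfs_insert_empty_items(trans, root, path, keys, sizes, 2);
	if (ret)
		return ret;

	/* on return the path points at the first of the new slots */
	for (i = 0; i < 2; i++) {
		unsigned long ptr;

		ptr = btrfs_item_ptr_offset(path->nodes[0],
					    path->slots[0] + i);
		write_extent_buffer(path->nodes[0], bodies[i], ptr, sizes[i]);
	}
	btrfs_mark_buffer_dirty(path->nodes[0]);
	btrfs_release_path(path);
	return 0;
}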

/*
 * Given a key and some data, insert an item into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *cpu_key, void *data,
		      u32 data_size)
{
	int ret = 0;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
	if (!ret) {
		leaf = path->nodes[0];
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(leaf);
	}
	btrfs_free_path(path);
	return ret;
}
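
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the simplest possible caller.  The key type chosen here is a hypothetical
 * placeholder; real callers use the item type they own, inside a running
 * transaction.
 */
static inline int example_insert_blob(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root, u64 objectid,
				      const void *blob, u32 blob_len)
{
	struct btrfs_key key;

	key.objectid = objectid;
	key.type = BTRFS_TEMPORARY_ITEM_KEY;	/* hypothetical choice */
	key.offset = 0;

	return btrfs_insert_item(trans, root, &key, (void *)blob, blob_len);
}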

/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		if (level) {
			ret = tree_mod_log_insert_move(parent, slot, slot + 1,
					nritems - slot - 1);
			BUG_ON(ret < 0);
		}
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
				GFP_NOFS);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}

/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	atomic_inc(&leaf->refs);
	btrfs_free_tree_block(trans, root, leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}
/*
 * delete the items at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 last_off;
	u32 dsize = 0;
	int ret = 0;
	int wret;
	int i;
	u32 nritems;

	leaf = path->nodes[0];
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		int data_end = leaf_data_end(leaf);
		struct btrfs_map_token token;

		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + dsize,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      last_off - data_end);

		btrfs_init_map_token(&token, leaf);
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(&token, item);
			btrfs_set_token_item_offset(&token, item, ioff + dsize);
		}

		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_set_path_blocking(path);
			btrfs_clean_tree_block(leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			atomic_inc(&leaf->refs);

			btrfs_set_path_blocking(path);
			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}
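
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * deleting a single item by key.  This is the common search-then-delete
 * pattern; btrfs_del_item() elsewhere in btrfs is just btrfs_del_items()
 * with nr == 1.  The helper name is hypothetical.
 */
static inline int example_delete_one(struct btrfs_trans_handle *trans,
				     struct btrfs_root *root,
				     struct btrfs_path *path,
				     const struct btrfs_key *key)
{
	int ret;

	ret = btrfs_search_slot(trans, root, key, path, -1, 1);
	if (ret < 0)
		return ret;
	if (ret > 0) {		/* key not found */
		btrfs_release_path(path);
		return -ENOENT;
	}
	ret = btrfs_del_items(trans, root, path, path->slots[0], 1);
	btrfs_release_path(path);
	return ret;
}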

/*
 * search the tree again to find a leaf with lesser keys
 * returns 0 if it found something or 1 if there are no lesser leaves.
 * returns < 0 on io errors.
 *
 * This may release the path, and so you may lose any locks held at the
 * time you call it.
 */
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	struct btrfs_key key;
	struct btrfs_disk_key found_key;
	int ret;

	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);

	if (key.offset > 0) {
		key.offset--;
	} else if (key.type > 0) {
		key.type--;
		key.offset = (u64)-1;
	} else if (key.objectid > 0) {
		key.objectid--;
		key.type = (u8)-1;
		key.offset = (u64)-1;
	} else {
		return 1;
	}

	btrfs_release_path(path);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = comp_keys(&found_key, &key);
	/*
	 * We might have had an item with the previous key in the tree right
	 * before we released our path. And after we released our path, that
	 * item might have been pushed to the first slot (0) of the leaf we
	 * were holding due to a tree balance. Alternatively, an item with the
	 * previous key can exist as the only element of a leaf (big fat item).
	 * Therefore account for these 2 cases, so that our callers (like
	 * btrfs_previous_item) don't miss an existing item with a key matching
	 * the previous key we computed above.
	 */
	if (ret <= 0)
		return 0;
	return 1;
}

5159 5160
/*
 * A helper function to walk down the tree starting at min_key, and looking
5161 5162
 * for nodes or leaves that are have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging
5163 5164 5165 5166 5167 5168 5169 5170
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
C
Chris Mason 已提交
5171 5172 5173 5174
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
5175 5176 5177 5178
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5179
			 struct btrfs_path *path,
5180 5181 5182 5183 5184
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
5185
	int sret;
5186 5187 5188
	u32 nritems;
	int level;
	int ret = 1;
5189
	int keep_locks = path->keep_locks;
5190

5191
	path->keep_locks = 1;
5192
again:
5193
	cur = btrfs_read_lock_root_node(root);
5194
	level = btrfs_header_level(cur);
5195
	WARN_ON(path->nodes[level]);
5196
	path->nodes[level] = cur;
5197
	path->locks[level] = BTRFS_READ_LOCK;
5198 5199 5200 5201 5202

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
C
Chris Mason 已提交
5203
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = btrfs_bin_search(cur, min_key, &slot);
		if (sret < 0) {
			ret = sret;
			goto out;
		}

		/* at the lowest level, we're done, set up the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameter.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			btrfs_set_path_blocking(path);
			sret = btrfs_find_next_key(root, path, min_key, level,
						  min_trans);
			if (sret == 0) {
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning to the caller */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		btrfs_set_path_blocking(path);
		cur = btrfs_read_node_slot(cur, slot);
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}

		btrfs_tree_read_lock(cur);

		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		btrfs_set_path_blocking(path);
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
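
/*
 * Illustrative usage sketch (not part of the original file): how a caller
 * such as defrag or tree logging might drive btrfs_search_forward() to
 * visit every item stored in a block newer than 'min_trans'.  The
 * process_item() callback named below is hypothetical, not a btrfs helper.
 */
static int __maybe_unused example_walk_newer_items(struct btrfs_root *root,
						   u64 min_trans)
{
	struct btrfs_path *path;
	struct btrfs_key min_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* start from the smallest possible key */
	min_key.objectid = 0;
	min_key.type = 0;
	min_key.offset = 0;

	while (1) {
		ret = btrfs_search_forward(root, &min_key, path, min_trans);
		if (ret != 0)
			break;	/* 1: nothing left, < 0: error */

		/* min_key now holds the found key and the path is set up */
		/* process_item(root, path, &min_key); -- hypothetical */
		btrfs_release_path(path);

		/* step past the key we just processed */
		if (min_key.offset < (u64)-1) {
			min_key.offset++;
		} else if (min_key.type < (u8)-1) {
			min_key.type++;
			min_key.offset = 0;
		} else if (min_key.objectid < (u64)-1) {
			min_key.objectid++;
			min_key.type = 0;
			min_key.offset = 0;
		} else {
			break;
		}
	}
	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}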

/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fix up the path.  It looks for and returns the next key in the
 * tree based on the current path and the min_trans parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
			struct btrfs_key *key, int level, u64 min_trans)
{
	int slot;
	struct extent_buffer *c;

	WARN_ON(!path->keep_locks && !path->skip_locking);
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
next:
		if (slot >= btrfs_header_nritems(c)) {
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
				return 1;

			if (path->locks[level + 1] || path->skip_locking) {
				level++;
				continue;
			}

			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
			btrfs_release_path(path);
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			if (ret == 0)
				slot++;
			goto next;
		}

		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (gen < min_trans) {
				slot++;
				goto next;
			}
			btrfs_node_key_to_cpu(c, key, slot);
		}
		return 0;
	}
	return 1;
}
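
/*
 * Illustrative usage sketch (not part of the original file): peeking at
 * the key that follows the current path position, roughly the way the
 * defrag code remembers where to resume.  As the WARN_ON above demands,
 * the preceding search must be done with path->keep_locks set.
 */
static int __maybe_unused example_peek_next_key(struct btrfs_root *root,
						const struct btrfs_key *search_key,
						struct btrfs_key *next_key)
{
	struct btrfs_path *path;
	struct btrfs_key key = *search_key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->keep_locks = 1;	/* keep parent locks so we can look upwards */
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	/* level 0, min_trans 0: accept any following key, however old */
	ret = btrfs_find_next_key(root, path, next_key, 0, 0);
out:
	btrfs_free_path(path);
	return ret;	/* 0: *next_key is valid, 1: no higher key, < 0: error */
}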

/*
 * search the tree again to find a leaf with greater keys
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	return btrfs_next_old_leaf(root, path, 0);
}

int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between
	 *   leaves gets another one with a bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later (esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check:
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}


	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				btrfs_set_path_blocking(path);
				__btrfs_tree_read_lock(next,
						       BTRFS_NESTING_RIGHT,
						       path->recurse);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				__btrfs_tree_read_lock(next,
						       BTRFS_NESTING_RIGHT,
						       path->recurse);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
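
/*
 * Illustrative usage sketch (not part of the original file): the canonical
 * pattern for iterating every item of a tree with btrfs_next_leaf().  The
 * handle_item() callback named below is hypothetical; real callers inline
 * their own per-item logic.
 */
static int __maybe_unused example_iterate_all_items(struct btrfs_root *root)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* start from the smallest possible key */
	key.objectid = 0;
	key.type = 0;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	while (1) {
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			if (ret > 0)
				break;	/* no more leaves */
			continue;
		}
		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		/* handle_item(leaf, &key, path->slots[0]); -- hypothetical */
		path->slots[0]++;
	}
	ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}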

/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			btrfs_set_path_blocking(path);
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == type)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
	}
	return 1;
}
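
/*
 * Illustrative usage sketch (not part of the original file): position a
 * path past everything an inode could own, then step backwards to the
 * nearest preceding BTRFS_INODE_ITEM_KEY with btrfs_previous_item().
 * 'ino' is a hypothetical inode number supplied by the caller.
 */
static int __maybe_unused example_find_prev_inode_item(struct btrfs_root *root,
						       struct btrfs_path *path,
						       u64 ino)
{
	struct btrfs_key key;
	int ret;

	/* search for a key just past anything with this objectid */
	key.objectid = ino;
	key.type = (u8)-1;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	/* walk backwards; stop once we would pass objectid 'ino' */
	ret = btrfs_previous_item(root, path, ino, BTRFS_INODE_ITEM_KEY);
	if (ret == 0) {
		/* the path now points at the inode item */
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	}
	return ret;	/* 0: found, 1: not found, < 0: error */
}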

/*
 * search in extent tree to find a previous Metadata/Data extent item with
 * min objectid.
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_extent_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	u32 nritems;
	int ret;

	while (1) {
		if (path->slots[0] == 0) {
			btrfs_set_path_blocking(path);
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
		    found_key.type == BTRFS_METADATA_ITEM_KEY)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
			break;
	}
	return 1;
}
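
/*
 * Illustrative usage sketch (not part of the original file): finding the
 * extent item at or before a given logical address, roughly the way scrub
 * walks the extent tree.  'extent_root' is assumed to be the extent tree
 * root (fs_info->extent_root in this era of the code).
 */
static int __maybe_unused example_find_extent_before(struct btrfs_root *extent_root,
						     u64 bytenr)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* position the path at (or just past) bytenr in the extent tree */
	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	/* step back to the previous EXTENT_ITEM or METADATA_ITEM */
	ret = btrfs_previous_extent_item(extent_root, path, bytenr);
	if (ret == 0)
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
out:
	btrfs_free_path(path);
	return ret;	/* 0: found, 1: nothing before bytenr, < 0: error */
}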