ctree.c 140.7 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
C
Chris Mason 已提交
2
/*
C
Chris Mason 已提交
3
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
C
Chris Mason 已提交
4 5
 */

6
#include <linux/sched.h>
7
#include <linux/slab.h>
8
#include <linux/rbtree.h>
9
#include <linux/mm.h>
10 11
#include "ctree.h"
#include "disk-io.h"
12
#include "transaction.h"
13
#include "print-tree.h"
14
#include "locking.h"
15
#include "volumes.h"
16
#include "qgroup.h"
17

18 19
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
20 21 22
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *ins_key, struct btrfs_path *path,
		      int data_size, int extend);
23
static int push_node_left(struct btrfs_trans_handle *trans,
24
			  struct extent_buffer *dst,
25
			  struct extent_buffer *src, int empty);
26 27 28
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
29 30
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);
31

32 33
static const struct btrfs_csums {
	u16		size;
34 35
	const char	name[10];
	const char	driver[12];
36 37
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
38
	[BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
39
	[BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
40 41
	[BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
				     .driver = "blake2b-256" },
42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
};

/* Return the checksum digest size in bytes for the type used by super block @s. */
int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
	u16 t = btrfs_super_csum_type(s);
	/*
	 * csum type is validated at mount time
	 */
	return btrfs_csums[t].size;
}

/* Return the human-readable name of checksum type @csum_type. */
const char *btrfs_super_csum_name(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].name;
}

59 60 61 62 63 64 65
/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
	/* csum type is validated at mount time */
	return btrfs_csums[csum_type].driver[0] ?
		btrfs_csums[csum_type].driver :
		btrfs_csums[csum_type].name;
}

71
/* Return the number of entries in the checksum algorithm table. */
size_t __attribute_const__ btrfs_get_num_csums(void)
{
	return ARRAY_SIZE(btrfs_csums);
}

C
Chris Mason 已提交
76
struct btrfs_path *btrfs_alloc_path(void)
C
Chris Mason 已提交
77
{
78
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
C
Chris Mason 已提交
79 80
}

C
Chris Mason 已提交
81
/* this also releases the path */
C
Chris Mason 已提交
82
void btrfs_free_path(struct btrfs_path *p)
83
{
84 85
	if (!p)
		return;
86
	btrfs_release_path(p);
C
Chris Mason 已提交
87
	kmem_cache_free(btrfs_path_cachep, p);
88 89
}

C
Chris Mason 已提交
90 91 92 93 94 95
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that no locks or extent buffers held.
 */
96
noinline void btrfs_release_path(struct btrfs_path *p)
97 98
{
	int i;
99

C
Chris Mason 已提交
100
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
101
		p->slots[i] = 0;
102
		if (!p->nodes[i])
103 104
			continue;
		if (p->locks[i]) {
105
			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
106 107
			p->locks[i] = 0;
		}
108
		free_extent_buffer(p->nodes[i]);
109
		p->nodes[i] = NULL;
110 111 112
	}
}

C
Chris Mason 已提交
113 114 115 116 117 118 119 120 121 122
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
123 124 125
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;
126

127 128 129 130 131 132
	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
133
		 * it was COWed but we may not get the new root node yet so do
134 135 136 137 138 139 140 141 142 143
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
144 145 146
	return eb;
}

147 148 149 150
/*
 * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
 * just get put onto a simple dirty list.  Transaction walks this list to make
 * sure they get properly updated on disk.
C
Chris Mason 已提交
151
 */
152 153
static void add_root_to_dirty_list(struct btrfs_root *root)
{
154 155
	struct btrfs_fs_info *fs_info = root->fs_info;

156 157 158 159
	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

160
	spin_lock(&fs_info->trans_lock);
161 162
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
163
		if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
164
			list_move_tail(&root->dirty_list,
165
				       &fs_info->dirty_cowonly_roots);
166 167
		else
			list_move(&root->dirty_list,
168
				  &fs_info->dirty_cowonly_roots);
169
	}
170
	spin_unlock(&fs_info->trans_lock);
171 172
}

C
Chris Mason 已提交
173 174 175 176 177
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
178 179 180 181 182
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
183
	struct btrfs_fs_info *fs_info = root->fs_info;
184 185 186
	struct extent_buffer *cow;
	int ret = 0;
	int level;
187
	struct btrfs_disk_key disk_key;
188

189
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
190
		trans->transid != fs_info->running_transaction->transid);
191
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
192
		trans->transid != root->last_trans);
193 194

	level = btrfs_header_level(buf);
195 196 197 198
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);
Z
Zheng Yan 已提交
199

200 201
	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
202
	if (IS_ERR(cow))
203 204
		return PTR_ERR(cow);

205
	copy_extent_buffer_full(cow, buf);
206 207
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
208 209 210 211 212 213 214
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);
215

216
	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
Y
Yan Zheng 已提交
217

218
	WARN_ON(btrfs_header_generation(buf) > trans->transid);
219
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
220
		ret = btrfs_inc_ref(trans, root, cow, 1);
221
	else
222
		ret = btrfs_inc_ref(trans, root, cow, 0);
223

224 225 226 227 228 229 230 231
	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
249
	u64 logical;
250
	u64 seq;
251 252 253 254 255 256 257 258 259 260 261 262 263
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
264 265 266 267
	struct {
		int dst_slot;
		int nr_items;
	} move;
268 269 270 271 272

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};

273
/*
J
Josef Bacik 已提交
274
 * Pull a new tree mod seq number for our operation.
275
 */
J
Josef Bacik 已提交
276
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
277 278 279 280
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

281 282 283 284 285 286 287 288 289 290
/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure to set elem->seq to zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
291
{
292
	write_lock(&fs_info->tree_mod_log_lock);
293
	if (!elem->seq) {
J
Josef Bacik 已提交
294
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
295 296
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
297
	write_unlock(&fs_info->tree_mod_log_lock);
298

J
Josef Bacik 已提交
299
	return elem->seq;
300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

315
	write_lock(&fs_info->tree_mod_log_lock);
316
	list_del(&elem->list);
317
	elem->seq = 0;
318

319 320 321 322 323 324 325 326 327 328 329 330
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we
			 * cannot remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
331
		}
332
		min_seq = first->seq;
333
	}
334

335 336 337 338 339 340 341
	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
342
		tm = rb_entry(node, struct tree_mod_elem, node);
343
		if (tm->seq >= min_seq)
344 345 346 347
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
348
	write_unlock(&fs_info->tree_mod_log_lock);
349 350 351 352
}

/*
 * key order of the log:
353
 *       node/leaf start address -> sequence
354
 *
355 356 357
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
358 359 360 361 362 363 364 365
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;
366

367 368
	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

J
Josef Bacik 已提交
369
	tm->seq = btrfs_inc_tree_mod_seq(fs_info);
370 371 372 373

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
374
		cur = rb_entry(*new, struct tree_mod_elem, node);
375
		parent = *new;
376
		if (cur->logical < tm->logical)
377
			new = &((*new)->rb_left);
378
		else if (cur->logical > tm->logical)
379
			new = &((*new)->rb_right);
380
		else if (cur->seq < tm->seq)
381
			new = &((*new)->rb_left);
382
		else if (cur->seq > tm->seq)
383
			new = &((*new)->rb_right);
384 385
		else
			return -EEXIST;
386 387 388 389
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
390
	return 0;
391 392
}

393 394 395 396
/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
397
 * write unlock fs_info::tree_mod_log_lock.
398
 */
399 400 401 402 403
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb) {
	smp_mb();
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 1;
404 405
	if (eb && btrfs_header_level(eb) == 0)
		return 1;
406

407
	write_lock(&fs_info->tree_mod_log_lock);
408
	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
409
		write_unlock(&fs_info->tree_mod_log_lock);
410 411 412
		return 1;
	}

413 414 415
	return 0;
}

416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431
/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb)
{
	smp_mb();
	/* Nobody holds a tree mod seq: no need to record anything. */
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 0;
	/* Level-0 (leaf) buffers are not tracked in the mod log. */
	if (eb && btrfs_header_level(eb) == 0)
		return 0;

	return 1;
}

/*
 * Allocate and fill a mod log element describing the key at @slot of @eb.
 * Returns NULL on allocation failure.
 */
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	/* For anything but an add we must remember what was overwritten. */
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

452 453
/*
 * Record a single key operation (@op) on slot @slot of @eb in the mod log.
 * Returns 0 on success (or when logging is not needed), negative errno on
 * failure.
 */
static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	/* Logging became unnecessary while we allocated; drop the element. */
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

478 479
static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
480
{
481 482 483
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
484
	int i;
485
	int locked = 0;
486

487
	if (!tree_mod_need_log(eb->fs_info, eb))
J
Jan Schmidt 已提交
488
		return 0;
489

490
	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
491 492 493
	if (!tm_list)
		return -ENOMEM;

494
	tm = kzalloc(sizeof(*tm), GFP_NOFS);
495 496 497 498 499
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

500
	tm->logical = eb->start;
501 502 503 504 505 506 507
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
508
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
509 510 511 512 513 514
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

515
	if (tree_mod_dont_log(eb->fs_info, eb))
516 517 518
		goto free_tms;
	locked = 1;

519 520 521 522 523
	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
524
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
525
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
526 527
		if (ret)
			goto free_tms;
528 529
	}

530
	ret = __tree_mod_log_insert(eb->fs_info, tm);
531 532
	if (ret)
		goto free_tms;
533
	write_unlock(&eb->fs_info->tree_mod_log_lock);
534
	kfree(tm_list);
J
Jan Schmidt 已提交
535

536 537 538 539
	return 0;
free_tms:
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
540
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
541 542 543
		kfree(tm_list[i]);
	}
	if (locked)
544
		write_unlock(&eb->fs_info->tree_mod_log_lock);
545 546
	kfree(tm_list);
	kfree(tm);
547

548
	return ret;
549 550
}

551 552 553 554
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
555
{
556
	int i, j;
557 558 559
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
560 561 562 563 564 565 566
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
567
	}
568 569

	return 0;
570 571
}

572 573
static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
574
{
575
	struct btrfs_fs_info *fs_info = old_root->fs_info;
576 577 578 579 580
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;
581

582
	if (!tree_mod_need_log(fs_info, NULL))
583 584
		return 0;

585 586
	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
587
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
588
				  GFP_NOFS);
589 590 591 592 593 594
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
595
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
596 597 598 599 600 601
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}
602

603
	tm = kzalloc(sizeof(*tm), GFP_NOFS);
604 605 606 607
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}
608

609
	tm->logical = new_root->start;
610 611 612 613 614
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

615 616 617 618 619 620 621 622
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

623
	write_unlock(&fs_info->tree_mod_log_lock);
624 625 626 627 628 629 630 631 632 633 634 635 636 637 638
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
639 640 641 642 643 644 645 646 647 648 649
}

/*
 * Search the mod log for an element at logical address @start with a sequence
 * number of at least @min_seq.  When @smallest is set the element with the
 * smallest matching seq is returned, otherwise the one with the highest seq.
 * Returns NULL if no matching element exists.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.  Returns NULL when no such
 * element exists.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.  Returns NULL when
 * no such element exists.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}

706
static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
707
		     struct extent_buffer *src, unsigned long dst_offset,
708
		     unsigned long src_offset, int nr_items)
709
{
710
	struct btrfs_fs_info *fs_info = dst->fs_info;
711 712 713
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
714
	int i;
715
	int locked = 0;
716

717 718
	if (!tree_mod_need_log(fs_info, NULL))
		return 0;
719

720
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
721 722
		return 0;

723
	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
724 725 726
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;
727

728 729
	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
730
	for (i = 0; i < nr_items; i++) {
731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
757
	}
758

759
	write_unlock(&fs_info->tree_mod_log_lock);
760 761 762 763 764 765 766 767 768 769 770
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
771
		write_unlock(&fs_info->tree_mod_log_lock);
772 773 774
	kfree(tm_list);

	return ret;
775 776
}

777
static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
778
{
779 780 781 782 783 784 785 786
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

787
	if (!tree_mod_need_log(eb->fs_info, NULL))
788 789 790
		return 0;

	nritems = btrfs_header_nritems(eb);
791
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
792 793 794 795 796 797 798 799 800 801 802 803
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

804
	if (tree_mod_dont_log(eb->fs_info, eb))
805 806
		goto free_tms;

807
	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
808
	write_unlock(&eb->fs_info->tree_mod_log_lock);
809 810 811 812 813 814 815 816 817 818 819 820
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
821 822
}

823 824 825 826 827 828 829
/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
830 831 832
	 * Tree blocks not in shareable trees and tree roots are never shared.
	 * If a block was allocated after the last snapshot and the block was
	 * not allocated by tree relocation, we know the block is not shared.
833
	 */
834
	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
835 836 837 838 839
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
840

841 842 843 844 845 846
	return 0;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
847 848
				       struct extent_buffer *cow,
				       int *last_ref)
849
{
850
	struct btrfs_fs_info *fs_info = root->fs_info;
851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree block
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is been relocating
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
875
		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
876 877
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
878 879
		if (ret)
			return ret;
880 881
		if (refs == 0) {
			ret = -EROFS;
882
			btrfs_handle_fs_error(fs_info, ret, NULL);
883 884
			return ret;
		}
885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
902
			ret = btrfs_inc_ref(trans, root, buf, 1);
903 904
			if (ret)
				return ret;
905 906 907

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
908
				ret = btrfs_dec_ref(trans, root, buf, 0);
909 910
				if (ret)
					return ret;
911
				ret = btrfs_inc_ref(trans, root, cow, 1);
912 913
				if (ret)
					return ret;
914 915 916 917 918 919
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
920
				ret = btrfs_inc_ref(trans, root, cow, 1);
921
			else
922
				ret = btrfs_inc_ref(trans, root, cow, 0);
923 924
			if (ret)
				return ret;
925 926
		}
		if (new_flags != 0) {
927 928
			int level = btrfs_header_level(buf);

929
			ret = btrfs_set_disk_extent_flags(trans, buf,
930
							  new_flags, level, 0);
931 932
			if (ret)
				return ret;
933 934 935 936 937
		}
	} else {
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
938
				ret = btrfs_inc_ref(trans, root, cow, 1);
939
			else
940
				ret = btrfs_inc_ref(trans, root, cow, 0);
941 942
			if (ret)
				return ret;
943
			ret = btrfs_dec_ref(trans, root, buf, 1);
944 945
			if (ret)
				return ret;
946
		}
947
		btrfs_clean_tree_block(buf);
948
		*last_ref = 1;
949 950 951 952
	}
	return 0;
}

953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994
/*
 * Allocate a tree block for COW while temporarily disabling flushing of
 * pending block groups on @trans for the trees listed below; the flag is
 * restored before returning.  See the comment inside for the deadlock this
 * avoids.  Returns the new extent buffer or an ERR_PTR.
 */
static struct extent_buffer *alloc_tree_block_no_bg_flush(
					  struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  u64 parent_start,
					  const struct btrfs_disk_key *disk_key,
					  int level,
					  u64 hint,
					  u64 empty_size)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *ret;

	/*
	 * If we are COWing a node/leaf from the extent, chunk, device or free
	 * space trees, make sure that we do not finish block group creation of
	 * pending block groups. We do this to avoid a deadlock.
	 * COWing can result in allocation of a new chunk, and flushing pending
	 * block groups (btrfs_create_pending_block_groups()) can be triggered
	 * when finishing allocation of a new chunk. Creation of a pending block
	 * group modifies the extent, chunk, device and free space trees,
	 * therefore we could deadlock with ourselves since we are holding a
	 * lock on an extent buffer that btrfs_create_pending_block_groups() may
	 * try to COW later.
	 * For similar reasons, we also need to delay flushing pending block
	 * groups when splitting a leaf or node, from one of those trees, since
	 * we are holding a write lock on it and its parent or when inserting a
	 * new root node for one of those trees.
	 */
	if (root == fs_info->extent_root ||
	    root == fs_info->chunk_root ||
	    root == fs_info->dev_root ||
	    root == fs_info->free_space_root)
		trans->can_flush_pending_bgs = false;

	ret = btrfs_alloc_tree_block(trans, root, parent_start,
				     root->root_key.objectid, disk_key, level,
				     hint, empty_size);
	trans->can_flush_pending_bgs = true;

	return ret;
}

C
Chris Mason 已提交
995
/*
C
Chris Mason 已提交
996 997 998 999
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
C
Chris Mason 已提交
1000 1001 1002
 *
 * search_start -- an allocation hint for the new block
 *
C
Chris Mason 已提交
1003 1004 1005
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
C
Chris Mason 已提交
1006
 */
C
Chris Mason 已提交
1007
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
1008 1009 1010 1011
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
1012
			     u64 search_start, u64 empty_size)
C
Chris Mason 已提交
1013
{
1014
	struct btrfs_fs_info *fs_info = root->fs_info;
1015
	struct btrfs_disk_key disk_key;
1016
	struct extent_buffer *cow;
1017
	int level, ret;
1018
	int last_ref = 0;
1019
	int unlock_orig = 0;
1020
	u64 parent_start = 0;
1021

1022 1023 1024
	if (*cow_ret == buf)
		unlock_orig = 1;

1025
	btrfs_assert_tree_locked(buf);
1026

1027
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
1028
		trans->transid != fs_info->running_transaction->transid);
1029
	WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
1030
		trans->transid != root->last_trans);
1031

1032
	level = btrfs_header_level(buf);
Z
Zheng Yan 已提交
1033

1034 1035 1036 1037 1038
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

1039 1040
	if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
		parent_start = parent->start;
1041

1042 1043
	cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
					   level, search_start, empty_size);
1044 1045
	if (IS_ERR(cow))
		return PTR_ERR(cow);
1046

1047 1048
	/* cow is set to blocking by btrfs_init_new_buffer */

1049
	copy_extent_buffer_full(cow, buf);
1050
	btrfs_set_header_bytenr(cow, cow->start);
1051
	btrfs_set_header_generation(cow, trans->transid);
1052 1053 1054 1055 1056 1057 1058
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);
1059

1060
	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
Y
Yan Zheng 已提交
1061

1062
	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
1063
	if (ret) {
1064
		btrfs_abort_transaction(trans, ret);
1065 1066
		return ret;
	}
Z
Zheng Yan 已提交
1067

1068
	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
1069
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
1070
		if (ret) {
1071
			btrfs_abort_transaction(trans, ret);
1072
			return ret;
1073
		}
1074
	}
1075

C
Chris Mason 已提交
1076
	if (buf == root->node) {
1077
		WARN_ON(parent && parent != buf);
1078 1079 1080
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;
1081

D
David Sterba 已提交
1082
		atomic_inc(&cow->refs);
1083 1084
		ret = tree_mod_log_insert_root(root->node, cow, 1);
		BUG_ON(ret < 0);
1085
		rcu_assign_pointer(root->node, cow);
1086

1087
		btrfs_free_tree_block(trans, root, buf, parent_start,
1088
				      last_ref);
1089
		free_extent_buffer(buf);
1090
		add_root_to_dirty_list(root);
C
Chris Mason 已提交
1091
	} else {
1092
		WARN_ON(trans->transid != btrfs_header_generation(parent));
1093
		tree_mod_log_insert_key(parent, parent_slot,
1094
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
1095
		btrfs_set_node_blockptr(parent, parent_slot,
1096
					cow->start);
1097 1098
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
C
Chris Mason 已提交
1099
		btrfs_mark_buffer_dirty(parent);
1100
		if (last_ref) {
1101
			ret = tree_mod_log_free_eb(buf);
1102
			if (ret) {
1103
				btrfs_abort_transaction(trans, ret);
1104 1105 1106
				return ret;
			}
		}
1107
		btrfs_free_tree_block(trans, root, buf, parent_start,
1108
				      last_ref);
C
Chris Mason 已提交
1109
	}
1110 1111
	if (unlock_orig)
		btrfs_tree_unlock(buf);
1112
	free_extent_buffer_stale(buf);
C
Chris Mason 已提交
1113
	btrfs_mark_buffer_dirty(cow);
C
Chris Mason 已提交
1114
	*cow_ret = cow;
C
Chris Mason 已提交
1115 1116 1117
	return 0;
}

J
Jan Schmidt 已提交
1118 1119 1120 1121
/*
 * returns the logical address of the oldest predecessor of the given root.
 * entries older than time_seq are ignored.
 */
1122 1123
static struct tree_mod_elem *__tree_mod_log_oldest_root(
		struct extent_buffer *eb_root, u64 time_seq)
J
Jan Schmidt 已提交
1124 1125 1126
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
1127
	u64 root_logical = eb_root->start;
J
Jan Schmidt 已提交
1128 1129 1130
	int looped = 0;

	if (!time_seq)
1131
		return NULL;
J
Jan Schmidt 已提交
1132 1133

	/*
1134 1135 1136 1137
	 * the very last operation that's logged for a root is the
	 * replacement operation (if it is replaced at all). this has
	 * the logical address of the *new* root, making it the very
	 * first operation that's logged for this root.
J
Jan Schmidt 已提交
1138 1139
	 */
	while (1) {
1140
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
J
Jan Schmidt 已提交
1141 1142
						time_seq);
		if (!looped && !tm)
1143
			return NULL;
J
Jan Schmidt 已提交
1144
		/*
1145 1146 1147
		 * if there are no tree operation for the oldest root, we simply
		 * return it. this should only happen if that (old) root is at
		 * level 0.
J
Jan Schmidt 已提交
1148
		 */
1149 1150
		if (!tm)
			break;
J
Jan Schmidt 已提交
1151

1152 1153 1154 1155 1156
		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
J
Jan Schmidt 已提交
1157 1158 1159 1160 1161 1162 1163 1164
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

1165 1166 1167 1168
	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

J
Jan Schmidt 已提交
1169 1170 1171 1172 1173
	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. then, all
1174
 * previous operations will be rewound (until we reach something older than
J
Jan Schmidt 已提交
1175 1176 1177
 * time_seq).
 */
static void
1178 1179
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
J
Jan Schmidt 已提交
1180 1181 1182 1183 1184 1185 1186 1187 1188
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
1189
	read_lock(&fs_info->tree_mod_log_lock);
1190
	while (tm && tm->seq >= time_seq) {
J
Jan Schmidt 已提交
1191 1192 1193 1194 1195 1196 1197 1198
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
1199
			fallthrough;
1200
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
1201
		case MOD_LOG_KEY_REMOVE:
J
Jan Schmidt 已提交
1202 1203 1204 1205
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
1206
			n++;
J
Jan Schmidt 已提交
1207 1208 1209 1210 1211 1212 1213 1214 1215
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
1216
			/* if a move operation is needed it's in the log */
J
Jan Schmidt 已提交
1217 1218 1219
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
1220 1221 1222
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
J
Jan Schmidt 已提交
1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
1240
		tm = rb_entry(next, struct tree_mod_elem, node);
1241
		if (tm->logical != first_tm->logical)
J
Jan Schmidt 已提交
1242 1243
			break;
	}
1244
	read_unlock(&fs_info->tree_mod_log_lock);
J
Jan Schmidt 已提交
1245 1246 1247
	btrfs_set_header_nritems(eb, n);
}

1248
/*
1249
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1250 1251 1252 1253 1254
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
J
Jan Schmidt 已提交
1255
static struct extent_buffer *
1256 1257
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
J
Jan Schmidt 已提交
1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

1272
	btrfs_set_path_blocking(path);
1273
	btrfs_set_lock_blocking_read(eb);
1274

J
Jan Schmidt 已提交
1275 1276
	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
1277
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
1278
		if (!eb_rewin) {
1279
			btrfs_tree_read_unlock_blocking(eb);
1280 1281 1282
			free_extent_buffer(eb);
			return NULL;
		}
J
Jan Schmidt 已提交
1283 1284 1285 1286
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
1287
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
J
Jan Schmidt 已提交
1288 1289
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
1290
		if (!eb_rewin) {
1291
			btrfs_tree_read_unlock_blocking(eb);
1292 1293 1294
			free_extent_buffer(eb);
			return NULL;
		}
J
Jan Schmidt 已提交
1295 1296
	}

1297
	btrfs_tree_read_unlock_blocking(eb);
J
Jan Schmidt 已提交
1298 1299
	free_extent_buffer(eb);

1300 1301
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
1302
	btrfs_tree_read_lock(eb_rewin);
1303
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
1304
	WARN_ON(btrfs_header_nritems(eb_rewin) >
1305
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));
J
Jan Schmidt 已提交
1306 1307 1308 1309

	return eb_rewin;
}

1310 1311 1312 1313 1314 1315 1316
/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->root_node is returned. If
 * anything changed in between, there's a fresh buffer allocated on which the
 * rewind operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
J
Jan Schmidt 已提交
1317 1318 1319
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
1320
	struct btrfs_fs_info *fs_info = root->fs_info;
J
Jan Schmidt 已提交
1321
	struct tree_mod_elem *tm;
1322 1323
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
1324
	u64 eb_root_owner = 0;
1325
	struct extent_buffer *old;
1326
	struct tree_mod_root *old_root = NULL;
1327
	u64 old_generation = 0;
1328
	u64 logical;
1329
	int level;
J
Jan Schmidt 已提交
1330

1331
	eb_root = btrfs_read_lock_root_node(root);
1332
	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
J
Jan Schmidt 已提交
1333
	if (!tm)
1334
		return eb_root;
J
Jan Schmidt 已提交
1335

1336 1337 1338 1339
	if (tm->op == MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
1340
		level = old_root->level;
1341
	} else {
1342
		logical = eb_root->start;
1343
		level = btrfs_header_level(eb_root);
1344
	}
J
Jan Schmidt 已提交
1345

1346
	tm = tree_mod_log_search(fs_info, logical, time_seq);
1347
	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1348 1349
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
1350
		old = read_tree_block(fs_info, logical, 0, level, NULL);
1351 1352 1353
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
1354 1355 1356
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
1357
		} else {
1358 1359
			eb = btrfs_clone_extent_buffer(old);
			free_extent_buffer(old);
1360 1361
		}
	} else if (old_root) {
1362
		eb_root_owner = btrfs_header_owner(eb_root);
1363 1364
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
1365
		eb = alloc_dummy_extent_buffer(fs_info, logical);
1366
	} else {
1367
		btrfs_set_lock_blocking_read(eb_root);
1368
		eb = btrfs_clone_extent_buffer(eb_root);
1369
		btrfs_tree_read_unlock_blocking(eb_root);
1370
		free_extent_buffer(eb_root);
1371 1372
	}

1373 1374
	if (!eb)
		return NULL;
1375
	if (old_root) {
J
Jan Schmidt 已提交
1376 1377
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1378
		btrfs_set_header_owner(eb, eb_root_owner);
1379 1380
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
J
Jan Schmidt 已提交
1381
	}
1382 1383 1384
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
1385
	if (tm)
1386
		__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
1387 1388
	else
		WARN_ON(btrfs_header_level(eb) != 0);
1389
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
J
Jan Schmidt 已提交
1390 1391 1392 1393

	return eb;
}

J
Jan Schmidt 已提交
1394 1395 1396 1397
/*
 * Report the level @root's root node had at mod-log time @time_seq.
 * Falls back to the current root level when no replacement was logged.
 */
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct extent_buffer *eb_root = btrfs_root_node(root);
	struct tree_mod_elem *tm;
	int level;

	tm = __tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == MOD_LOG_ROOT_REPLACE)
		level = tm->old_root.level;
	else
		level = btrfs_header_level(eb_root);
	free_extent_buffer(eb_root);

	return level;
}

1411 1412 1413 1414
static inline int should_cow_block(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct extent_buffer *buf)
{
1415
	if (btrfs_is_testing(root->fs_info))
1416
		return 0;
1417

1418 1419
	/* Ensure we can see the FORCE_COW bit */
	smp_mb__before_atomic();
1420 1421 1422 1423 1424 1425 1426 1427

	/*
	 * We do not need to cow a block if
	 * 1) this block is not created or changed in this transaction;
	 * 2) this block does not belong to TREE_RELOC tree;
	 * 3) the root is not forced COW.
	 *
	 * What is forced COW:
1428
	 *    when we create snapshot during committing the transaction,
1429
	 *    after we've finished copying src root, we must COW the shared
1430 1431
	 *    block to ensure the metadata consistency.
	 */
1432 1433 1434
	if (btrfs_header_generation(buf) == trans->transid &&
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1435
	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1436
	    !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1437 1438 1439 1440
		return 0;
	return 1;
}

C
Chris Mason 已提交
1441 1442
/*
 * cows a single block, see __btrfs_cow_block for the real work.
1443
 * This version of it has extra checks so that a block isn't COWed more than
C
Chris Mason 已提交
1444 1445
 * once per transaction, as long as it hasn't been written yet
 */
C
Chris Mason 已提交
1446
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1447 1448
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
1449
		    struct extent_buffer **cow_ret)
1450
{
1451
	struct btrfs_fs_info *fs_info = root->fs_info;
1452
	u64 search_start;
1453
	int ret;
C
Chris Mason 已提交
1454

1455 1456 1457 1458
	if (test_bit(BTRFS_ROOT_DELETING, &root->state))
		btrfs_err(fs_info,
			"COW'ing blocks on a fs root that's being dropped");

1459
	if (trans->transaction != fs_info->running_transaction)
J
Julia Lawall 已提交
1460
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
1461
		       trans->transid,
1462
		       fs_info->running_transaction->transid);
J
Julia Lawall 已提交
1463

1464
	if (trans->transid != fs_info->generation)
J
Julia Lawall 已提交
1465
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
1466
		       trans->transid, fs_info->generation);
C
Chris Mason 已提交
1467

1468
	if (!should_cow_block(trans, root, buf)) {
1469
		trans->dirty = true;
1470 1471 1472
		*cow_ret = buf;
		return 0;
	}
1473

1474
	search_start = buf->start & ~((u64)SZ_1G - 1);
1475 1476

	if (parent)
1477 1478
		btrfs_set_lock_blocking_write(parent);
	btrfs_set_lock_blocking_write(buf);
1479

1480 1481 1482 1483 1484 1485 1486
	/*
	 * Before CoWing this block for later modification, check if it's
	 * the subtree root and do the delayed subtree trace if needed.
	 *
	 * Also We don't care about the error, as it's handled internally.
	 */
	btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
1487
	ret = __btrfs_cow_block(trans, root, buf, parent,
1488
				 parent_slot, cow_ret, search_start, 0);
1489 1490 1491

	trace_btrfs_cow_block(root, buf, *cow_ret);

1492
	return ret;
1493 1494
}

C
Chris Mason 已提交
1495 1496 1497 1498
/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
1499
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1500
{
1501
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
1502
		return 1;
1503
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
1504 1505 1506 1507
		return 1;
	return 0;
}

1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523
#ifdef __LITTLE_ENDIAN

/*
 * Compare two keys, on little-endian the disk order is same as CPU order and
 * we can avoid the conversion.
 */
static int comp_keys(const struct btrfs_disk_key *disk_key,
		     const struct btrfs_key *k2)
{
	const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;

	return btrfs_comp_cpu_keys(k1, k2);
}

#else

1524 1525 1526
/*
 * compare two keys in a memcmp fashion
 */
1527 1528
static int comp_keys(const struct btrfs_disk_key *disk,
		     const struct btrfs_key *k2)
1529 1530 1531 1532 1533
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

1534
	return btrfs_comp_cpu_keys(&k1, k2);
1535
}
1536
#endif
1537

1538 1539 1540
/*
 * same as comp_keys only with two btrfs_key's
 */
1541
int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
1557

C
Chris Mason 已提交
1558 1559 1560 1561 1562
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
1563
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1564
		       struct btrfs_root *root, struct extent_buffer *parent,
1565
		       int start_slot, u64 *last_ret,
1566
		       struct btrfs_key *progress)
1567
{
1568
	struct btrfs_fs_info *fs_info = root->fs_info;
1569
	struct extent_buffer *cur;
1570
	u64 blocknr;
1571
	u64 gen;
1572 1573
	u64 search_start = *last_ret;
	u64 last_block = 0;
1574 1575 1576 1577 1578
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
1579
	int parent_level;
1580 1581
	int uptodate;
	u32 blocksize;
1582 1583
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;
1584

1585 1586
	parent_level = btrfs_header_level(parent);

1587 1588
	WARN_ON(trans->transaction != fs_info->running_transaction);
	WARN_ON(trans->transid != fs_info->generation);
1589

1590
	parent_nritems = btrfs_header_nritems(parent);
1591
	blocksize = fs_info->nodesize;
1592
	end_slot = parent_nritems - 1;
1593

1594
	if (parent_nritems <= 1)
1595 1596
		return 0;

1597
	btrfs_set_lock_blocking_write(parent);
1598

1599
	for (i = start_slot; i <= end_slot; i++) {
1600
		struct btrfs_key first_key;
1601
		int close = 1;
1602

1603 1604 1605 1606 1607
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
1608
		blocknr = btrfs_node_blockptr(parent, i);
1609
		gen = btrfs_node_ptr_generation(parent, i);
1610
		btrfs_node_key_to_cpu(parent, &first_key, i);
1611 1612
		if (last_block == 0)
			last_block = blocknr;
1613

1614
		if (i > 0) {
1615 1616
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
1617
		}
1618
		if (!close && i < end_slot) {
1619 1620
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
1621
		}
1622 1623
		if (close) {
			last_block = blocknr;
1624
			continue;
1625
		}
1626

1627
		cur = find_extent_buffer(fs_info, blocknr);
1628
		if (cur)
1629
			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1630 1631
		else
			uptodate = 0;
1632
		if (!cur || !uptodate) {
1633
			if (!cur) {
1634 1635 1636
				cur = read_tree_block(fs_info, blocknr, gen,
						      parent_level - 1,
						      &first_key);
1637 1638 1639
				if (IS_ERR(cur)) {
					return PTR_ERR(cur);
				} else if (!extent_buffer_uptodate(cur)) {
1640
					free_extent_buffer(cur);
1641
					return -EIO;
1642
				}
1643
			} else if (!uptodate) {
1644 1645
				err = btrfs_read_buffer(cur, gen,
						parent_level - 1,&first_key);
1646 1647 1648 1649
				if (err) {
					free_extent_buffer(cur);
					return err;
				}
1650
			}
1651
		}
1652
		if (search_start == 0)
1653
			search_start = last_block;
1654

1655
		btrfs_tree_lock(cur);
1656
		btrfs_set_lock_blocking_write(cur);
1657
		err = __btrfs_cow_block(trans, root, cur, parent, i,
1658
					&cur, search_start,
1659
					min(16 * blocksize,
1660
					    (end_slot - i) * blocksize));
Y
Yan 已提交
1661
		if (err) {
1662
			btrfs_tree_unlock(cur);
1663
			free_extent_buffer(cur);
1664
			break;
Y
Yan 已提交
1665
		}
1666 1667
		search_start = cur->start;
		last_block = cur->start;
1668
		*last_ret = search_start;
1669 1670
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
1671 1672 1673 1674
	}
	return err;
}

C
Chris Mason 已提交
1675
/*
1676 1677 1678
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
C
Chris Mason 已提交
1679 1680 1681 1682 1683 1684
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
1685
static noinline int generic_bin_search(struct extent_buffer *eb,
1686 1687
				       unsigned long p, int item_size,
				       const struct btrfs_key *key,
1688
				       int max, int *slot)
1689 1690 1691 1692
{
	int low = 0;
	int high = max;
	int ret;
1693
	const int key_size = sizeof(struct btrfs_disk_key);
1694

1695 1696 1697 1698 1699 1700 1701 1702
	if (low > high) {
		btrfs_err(eb->fs_info,
		 "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
			  __func__, low, high, eb->start,
			  btrfs_header_owner(eb), btrfs_header_level(eb));
		return -EINVAL;
	}

C
Chris Mason 已提交
1703
	while (low < high) {
1704 1705 1706 1707 1708 1709
		unsigned long oip;
		unsigned long offset;
		struct btrfs_disk_key *tmp;
		struct btrfs_disk_key unaligned;
		int mid;

1710
		mid = (low + high) / 2;
1711
		offset = p + mid * item_size;
1712
		oip = offset_in_page(offset);
1713

1714 1715 1716
		if (oip + key_size <= PAGE_SIZE) {
			const unsigned long idx = offset >> PAGE_SHIFT;
			char *kaddr = page_address(eb->pages[idx]);
1717

1718
			tmp = (struct btrfs_disk_key *)(kaddr + oip);
1719
		} else {
1720 1721
			read_extent_buffer(eb, &unaligned, offset, key_size);
			tmp = &unaligned;
1722
		}
1723

1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
			return 0;
		}
	}
	*slot = low;
	return 1;
}

C
Chris Mason 已提交
1739 1740 1741 1742
/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
1743
int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1744
		     int *slot)
1745
{
1746
	if (btrfs_header_level(eb) == 0)
1747 1748
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
C
Chris Mason 已提交
1749
					  sizeof(struct btrfs_item),
1750
					  key, btrfs_header_nritems(eb),
1751
					  slot);
1752
	else
1753 1754
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
C
Chris Mason 已提交
1755
					  sizeof(struct btrfs_key_ptr),
1756
					  key, btrfs_header_nritems(eb),
1757
					  slot);
1758 1759
}

1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775
/* Grow the root item's bytes-used accounting by @size, under the lock. */
static void root_add_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) + size);
	spin_unlock(&root->accounting_lock);
}

/* Shrink the root item's bytes-used accounting by @size, under the lock. */
static void root_sub_used(struct btrfs_root *root, u32 size)
{
	spin_lock(&root->accounting_lock);
	btrfs_set_root_used(&root->root_item,
			    btrfs_root_used(&root->root_item) - size);
	spin_unlock(&root->accounting_lock);
}

C
Chris Mason 已提交
1776 1777 1778
/* given a node and slot number, this reads the blocks it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 */
1779 1780
struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
					   int slot)
1781
{
1782
	int level = btrfs_header_level(parent);
1783
	struct extent_buffer *eb;
1784
	struct btrfs_key first_key;
1785

1786 1787
	if (slot < 0 || slot >= btrfs_header_nritems(parent))
		return ERR_PTR(-ENOENT);
1788 1789 1790

	BUG_ON(level == 0);

1791
	btrfs_node_key_to_cpu(parent, &first_key, slot);
1792
	eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
1793 1794
			     btrfs_node_ptr_generation(parent, slot),
			     level - 1, &first_key);
1795 1796 1797
	if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
		free_extent_buffer(eb);
		eb = ERR_PTR(-EIO);
1798 1799 1800
	}

	return eb;
1801 1802
}

C
Chris Mason 已提交
1803 1804 1805 1806 1807
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 */
1808
static noinline int balance_level(struct btrfs_trans_handle *trans,
1809 1810
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
1811
{
1812
	struct btrfs_fs_info *fs_info = root->fs_info;
1813 1814 1815 1816
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
1817 1818 1819 1820
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
1821
	u64 orig_ptr;
1822

1823
	ASSERT(level > 0);
1824

1825
	mid = path->nodes[level];
1826

1827 1828
	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1829 1830
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

1831
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1832

L
Li Zefan 已提交
1833
	if (level < BTRFS_MAX_LEVEL - 1) {
1834
		parent = path->nodes[level + 1];
L
Li Zefan 已提交
1835 1836
		pslot = path->slots[level + 1];
	}
1837

C
Chris Mason 已提交
1838 1839 1840 1841
	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
1842 1843
	if (!parent) {
		struct extent_buffer *child;
1844

1845
		if (btrfs_header_nritems(mid) != 1)
1846 1847 1848
			return 0;

		/* promote the child to a root */
1849
		child = btrfs_read_node_slot(mid, 0);
1850 1851
		if (IS_ERR(child)) {
			ret = PTR_ERR(child);
1852
			btrfs_handle_fs_error(fs_info, ret, NULL);
1853 1854 1855
			goto enospc;
		}

1856
		btrfs_tree_lock(child);
1857
		btrfs_set_lock_blocking_write(child);
1858
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1859 1860 1861 1862 1863
		if (ret) {
			btrfs_tree_unlock(child);
			free_extent_buffer(child);
			goto enospc;
		}
1864

1865 1866
		ret = tree_mod_log_insert_root(root->node, child, 1);
		BUG_ON(ret < 0);
1867
		rcu_assign_pointer(root->node, child);
1868

1869
		add_root_to_dirty_list(root);
1870
		btrfs_tree_unlock(child);
1871

1872
		path->locks[level] = 0;
1873
		path->nodes[level] = NULL;
1874
		btrfs_clean_tree_block(mid);
1875
		btrfs_tree_unlock(mid);
1876
		/* once for the path */
1877
		free_extent_buffer(mid);
1878 1879

		root_sub_used(root, mid->len);
1880
		btrfs_free_tree_block(trans, root, mid, 0, 1);
1881
		/* once for the root ptr */
1882
		free_extent_buffer_stale(mid);
1883
		return 0;
1884
	}
1885
	if (btrfs_header_nritems(mid) >
1886
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1887 1888
		return 0;

1889
	left = btrfs_read_node_slot(parent, pslot - 1);
1890 1891 1892
	if (IS_ERR(left))
		left = NULL;

1893
	if (left) {
1894
		btrfs_tree_lock(left);
1895
		btrfs_set_lock_blocking_write(left);
1896
		wret = btrfs_cow_block(trans, root, left,
1897
				       parent, pslot - 1, &left);
1898 1899 1900 1901
		if (wret) {
			ret = wret;
			goto enospc;
		}
1902
	}
1903

1904
	right = btrfs_read_node_slot(parent, pslot + 1);
1905 1906 1907
	if (IS_ERR(right))
		right = NULL;

1908
	if (right) {
1909
		btrfs_tree_lock(right);
1910
		btrfs_set_lock_blocking_write(right);
1911
		wret = btrfs_cow_block(trans, root, right,
1912
				       parent, pslot + 1, &right);
1913 1914 1915 1916 1917 1918 1919
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
1920 1921
	if (left) {
		orig_slot += btrfs_header_nritems(left);
1922
		wret = push_node_left(trans, left, mid, 1);
1923 1924
		if (wret < 0)
			ret = wret;
1925
	}
1926 1927 1928 1929

	/*
	 * then try to empty the right most buffer into the middle
	 */
1930
	if (right) {
1931
		wret = push_node_left(trans, mid, right, 1);
1932
		if (wret < 0 && wret != -ENOSPC)
1933
			ret = wret;
1934
		if (btrfs_header_nritems(right) == 0) {
1935
			btrfs_clean_tree_block(right);
1936
			btrfs_tree_unlock(right);
1937
			del_ptr(root, path, level + 1, pslot + 1);
1938
			root_sub_used(root, right->len);
1939
			btrfs_free_tree_block(trans, root, right, 0, 1);
1940
			free_extent_buffer_stale(right);
1941
			right = NULL;
1942
		} else {
1943 1944
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
1945 1946 1947
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
1948 1949
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
1950 1951
		}
	}
1952
	if (btrfs_header_nritems(mid) == 1) {
1953 1954 1955 1956 1957 1958 1959 1960 1961
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
1962 1963
		if (!left) {
			ret = -EROFS;
1964
			btrfs_handle_fs_error(fs_info, ret, NULL);
1965 1966
			goto enospc;
		}
1967
		wret = balance_node_right(trans, mid, left);
1968
		if (wret < 0) {
1969
			ret = wret;
1970 1971
			goto enospc;
		}
1972
		if (wret == 1) {
1973
			wret = push_node_left(trans, left, mid, 1);
1974 1975 1976
			if (wret < 0)
				ret = wret;
		}
1977 1978
		BUG_ON(wret == 1);
	}
1979
	if (btrfs_header_nritems(mid) == 0) {
1980
		btrfs_clean_tree_block(mid);
1981
		btrfs_tree_unlock(mid);
1982
		del_ptr(root, path, level + 1, pslot);
1983
		root_sub_used(root, mid->len);
1984
		btrfs_free_tree_block(trans, root, mid, 0, 1);
1985
		free_extent_buffer_stale(mid);
1986
		mid = NULL;
1987 1988
	} else {
		/* update the parent key to reflect our changes */
1989 1990
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
1991 1992 1993
		ret = tree_mod_log_insert_key(parent, pslot,
				MOD_LOG_KEY_REPLACE, GFP_NOFS);
		BUG_ON(ret < 0);
1994 1995
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
1996
	}
1997

1998
	/* update the path */
1999 2000
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
D
David Sterba 已提交
2001
			atomic_inc(&left->refs);
2002
			/* left was locked after cow */
2003
			path->nodes[level] = left;
2004 2005
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
2006 2007
			if (mid) {
				btrfs_tree_unlock(mid);
2008
				free_extent_buffer(mid);
2009
			}
2010
		} else {
2011
			orig_slot -= btrfs_header_nritems(left);
2012 2013 2014
			path->slots[level] = orig_slot;
		}
	}
2015
	/* double check we haven't messed things up */
C
Chris Mason 已提交
2016
	if (orig_ptr !=
2017
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2018
		BUG();
2019
enospc:
2020 2021
	if (right) {
		btrfs_tree_unlock(right);
2022
		free_extent_buffer(right);
2023 2024 2025 2026
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
2027
		free_extent_buffer(left);
2028
	}
2029 2030 2031
	return ret;
}

C
Chris Mason 已提交
2032 2033 2034 2035
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	/* leaves are handled by split_leaf(), not here */
	if (level == 0)
		return 1;

	mid = path->nodes[level];
	/* callers must have COWed mid in this transaction already */
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* mid is the tree root: no siblings exist to push into */
	if (!parent)
		return 1;

	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking_write(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* left sibling is nearly full, cannot push */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* pointers moved into left; update parent's key for mid */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* the slot we were on moved into left */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				/* still in mid, just at an earlier slot */
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking_write(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* right sibling is nearly full, cannot push */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/* pointers moved into right; update its parent key */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* the slot we were on moved into right */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	/* nothing could be pushed either way; caller must split */
	return 1;
}

2173
/*
C
Chris Mason 已提交
2174 2175
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
2176
 */
2177
static void reada_for_search(struct btrfs_fs_info *fs_info,
2178 2179
			     struct btrfs_path *path,
			     int level, int slot, u64 objectid)
2180
{
2181
	struct extent_buffer *node;
2182
	struct btrfs_disk_key disk_key;
2183 2184
	u32 nritems;
	u64 search;
2185
	u64 target;
2186
	u64 nread = 0;
2187
	struct extent_buffer *eb;
2188 2189 2190
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;
2191

2192
	if (level != 1)
2193 2194 2195
		return;

	if (!path->nodes[level])
2196 2197
		return;

2198
	node = path->nodes[level];
2199

2200
	search = btrfs_node_blockptr(node, slot);
2201 2202
	blocksize = fs_info->nodesize;
	eb = find_extent_buffer(fs_info, search);
2203 2204
	if (eb) {
		free_extent_buffer(eb);
2205 2206 2207
		return;
	}

2208
	target = search;
2209

2210
	nritems = btrfs_header_nritems(node);
2211
	nr = slot;
2212

C
Chris Mason 已提交
2213
	while (1) {
2214
		if (path->reada == READA_BACK) {
2215 2216 2217
			if (nr == 0)
				break;
			nr--;
2218
		} else if (path->reada == READA_FORWARD) {
2219 2220 2221
			nr++;
			if (nr >= nritems)
				break;
2222
		}
2223
		if (path->reada == READA_BACK && objectid) {
2224 2225 2226 2227
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
2228
		search = btrfs_node_blockptr(node, nr);
2229 2230
		if ((search <= target && target - search <= 65536) ||
		    (search > target && search - target <= 65536)) {
2231
			readahead_tree_block(fs_info, search);
2232 2233 2234
			nread += blocksize;
		}
		nscan++;
2235
		if ((nread > 65536 || nscan > 32))
2236
			break;
2237 2238
	}
}
2239

2240
static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
J
Josef Bacik 已提交
2241
				       struct btrfs_path *path, int level)
2242 2243 2244 2245 2246 2247 2248 2249 2250
{
	int slot;
	int nritems;
	struct extent_buffer *parent;
	struct extent_buffer *eb;
	u64 gen;
	u64 block1 = 0;
	u64 block2 = 0;

2251
	parent = path->nodes[level + 1];
2252
	if (!parent)
J
Josef Bacik 已提交
2253
		return;
2254 2255

	nritems = btrfs_header_nritems(parent);
2256
	slot = path->slots[level + 1];
2257 2258 2259 2260

	if (slot > 0) {
		block1 = btrfs_node_blockptr(parent, slot - 1);
		gen = btrfs_node_ptr_generation(parent, slot - 1);
2261
		eb = find_extent_buffer(fs_info, block1);
2262 2263 2264 2265 2266 2267
		/*
		 * if we get -eagain from btrfs_buffer_uptodate, we
		 * don't want to return eagain here.  That will loop
		 * forever
		 */
		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2268 2269 2270
			block1 = 0;
		free_extent_buffer(eb);
	}
2271
	if (slot + 1 < nritems) {
2272 2273
		block2 = btrfs_node_blockptr(parent, slot + 1);
		gen = btrfs_node_ptr_generation(parent, slot + 1);
2274
		eb = find_extent_buffer(fs_info, block2);
2275
		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2276 2277 2278
			block2 = 0;
		free_extent_buffer(eb);
	}
2279

J
Josef Bacik 已提交
2280
	if (block1)
2281
		readahead_tree_block(fs_info, block1);
J
Josef Bacik 已提交
2282
	if (block2)
2283
		readahead_tree_block(fs_info, block2);
2284 2285 2286
}


C
Chris Mason 已提交
2287
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers
 * in the tree.  The exceptions are when our path goes through slot 0, because
 * operations on the tree might require changing key pointers higher up in the
 * tree.
 *
 * callers might also have set path->keep_locks, which tells this code to keep
 * the lock if the path points to the last slot in the block.  This is part of
 * walking through the tree, and selecting the next slot in the higher block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
 * if lowest_unlock is 1, level 0 won't be unlocked
 */
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock, int min_write_lock_level,
			       int *write_lock_level)
{
	int i;
	int skip_level = level;	/* levels at or below this stay locked */
	int no_skips = 0;	/* once set, stop raising skip_level */
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
		/* slot 0: a key change here must propagate to the parent */
		if (!no_skips && path->slots[i] == 0) {
			skip_level = i + 1;
			continue;
		}
		/* keep_locks: hold the lock while on the last slot */
		if (!no_skips && path->keep_locks) {
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
				skip_level = i + 1;
				continue;
			}
		}
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level) {
			btrfs_tree_unlock_rw(t, path->locks[i]);
			path->locks[i] = 0;
			/* lower the recorded write-lock watermark as we unlock */
			if (write_lock_level &&
			    i > min_write_lock_level &&
			    i <= *write_lock_level) {
				*write_lock_level = i - 1;
			}
		}
	}
}

2343 2344 2345 2346 2347 2348 2349 2350 2351
/*
 * helper function for btrfs_search_slot.  The goal is to find a block
 * in cache without setting the path to blocking.  If we find the block
 * we return zero and the path is unchanged.
 *
 * If we can't find the block, we set the path blocking and do some
 * reada.  -EAGAIN is returned and the search must be repeated.
 */
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
		      struct extent_buffer **eb_ret, int level, int slot,
		      const struct btrfs_key *key)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 blocknr;
	u64 gen;
	struct extent_buffer *tmp;
	struct btrfs_key first_key;
	int ret;
	int parent_level;

	/* the child we want to descend into, as recorded in the parent */
	blocknr = btrfs_node_blockptr(*eb_ret, slot);
	gen = btrfs_node_ptr_generation(*eb_ret, slot);
	parent_level = btrfs_header_level(*eb_ret);
	btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);

	tmp = find_extent_buffer(fs_info, blocknr);
	if (tmp) {
		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			/*
			 * Do extra check for first_key, eb can be stale due to
			 * being cached, read from scrub, or have multiple
			 * parents (shared tree blocks).
			 */
			if (btrfs_verify_level_key(tmp,
					parent_level - 1, &first_key, gen)) {
				free_extent_buffer(tmp);
				return -EUCLEAN;
			}
			*eb_ret = tmp;
			return 0;
		}

		/* the pages were up to date, but we failed
		 * the generation number check.  Do a full
		 * read for the generation number that is correct.
		 * We must do this without dropping locks so
		 * we can trust our generation number
		 */
		btrfs_set_path_blocking(p);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
		if (!ret) {
			*eb_ret = tmp;
			return 0;
		}
		free_extent_buffer(tmp);
		btrfs_release_path(p);
		return -EIO;
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	if (p->reada != READA_NONE)
		reada_for_search(fs_info, p, level, slot, key->objectid);

	/* default: ask the caller to retry the whole search */
	ret = -EAGAIN;
	tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
			      &first_key);
	if (!IS_ERR(tmp)) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!extent_buffer_uptodate(tmp))
			ret = -EIO;
		free_extent_buffer(tmp);
	} else {
		ret = PTR_ERR(tmp);
	}

	btrfs_release_path(p);
	return ret;
}

/*
 * helper function for btrfs_search_slot.  This does all of the checks
 * for node-level blocks and does any balancing required based on
 * the ins_len.
 *
 * If no extra work was required, zero is returned.  If we had to
 * drop the path, -EAGAIN is returned and btrfs_search_slot must
 * start over
 */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret;

	/* inserting: split nodes that are close to full */
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
		int sret;

		/* splitting modifies the parent, need it write locked */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(fs_info, p, level);
		sret = split_node(trans, root, p, level);

		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
		/* deleting: merge nodes that are less than half full */
		int sret;

		/* balancing modifies the parent, need it write locked */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(fs_info, p, level);
		sret = balance_level(trans, root, p, level);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		/* balance_level may have emptied and freed this node */
		if (!b) {
			btrfs_release_path(p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}

2511
/*
 * Search @fs_root for the item (@iobjectid, @key_type, @ioff) and return the
 * key actually found in *@found_key.  Returns 0 when an item with matching
 * objectid and type exists, 1 when it does not, negative errno on error.
 */
int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
		u64 iobjectid, u64 ioff, u8 key_type,
		struct btrfs_key *found_key)
{
	struct extent_buffer *leaf;
	struct btrfs_key search_key;
	int ret;

	ASSERT(path);
	ASSERT(found_key);

	search_key.objectid = iobjectid;
	search_key.type = key_type;
	search_key.offset = ioff;

	ret = btrfs_search_slot(NULL, fs_root, &search_key, path, 0, 0);
	if (ret < 0)
		return ret;

	leaf = path->nodes[0];
	/* an inexact match may land one past the end of a leaf */
	if (ret && path->slots[0] >= btrfs_header_nritems(leaf)) {
		ret = btrfs_next_leaf(fs_root, path);
		if (ret)
			return ret;
		leaf = path->nodes[0];
	}

	btrfs_item_key_to_cpu(leaf, found_key, path->slots[0]);
	if (found_key->objectid != search_key.objectid ||
	    found_key->type != search_key.type)
		return 1;

	return 0;
}

2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558
/*
 * Grab the root node of @root for a search, honouring p->search_commit_root
 * and p->skip_locking.  On return the node is recorded in the path (locked
 * unless locking is skipped) and a reference is held; callers are responsible
 * for dropping that reference.  Returns the node or an ERR_PTR.
 */
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int root_lock;
	int level = 0;

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	if (p->search_commit_root) {
		/*
		 * The commit roots are read only so we always do read locks,
		 * and we always must hold the commit_root_sem when doing
		 * searches on them, the only exception is send where we don't
		 * want to block transaction commits for a long time, so
		 * we need to clone the commit root in order to avoid races
		 * with transaction commits that create a snapshot of one of
		 * the roots used by a send operation.
		 */
		if (p->need_commit_sem) {
			down_read(&fs_info->commit_root_sem);
			b = btrfs_clone_extent_buffer(root->commit_root);
			up_read(&fs_info->commit_root_sem);
			if (!b)
				return ERR_PTR(-ENOMEM);

		} else {
			b = root->commit_root;
			atomic_inc(&b->refs);
		}
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		b = btrfs_read_lock_root_node(root);
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}


C
Chris Mason 已提交
2631
/*
 * btrfs_search_slot - look for a key in a tree and perform necessary
 * modifications to preserve tree invariants.
 *
 * @trans:	Handle of transaction, used when modifying the tree
 * @p:		Holds all btree nodes along the search path
 * @root:	The root node of the tree
 * @key:	The key we are looking for
 * @ins_len:	Indicates purpose of search, for inserts it is 1, for
 *		deletions it's -1. 0 for plain searches
 * @cow:	boolean should CoW operations be performed. Must always be 1
 *		when modifying the tree.
 *
 * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
 * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
 *
 * If @key is found, 0 is returned and you can find the item in the leaf level
 * of the path (level 0)
 *
 * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
 * points to the slot where it should be inserted
 *
 * If an error is encountered while searching the tree a negative error number
 * is returned
 */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers  Make sure we keep write
		 * for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);
	if (IS_ERR(b)) {
		ret = PTR_ERR(b);
		goto done;
	}

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);

		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b)) {
				trans->dirty = true;
				goto cow_done;
			}

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			btrfs_set_path_blocking(p);
			/* the top level has no parent to pass down */
			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		/*
		 * Leave path with blocking locks to avoid massive
		 * lock context switch, this is made on purpose.
		 */

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		/*
		 * If btrfs_bin_search returns an exact match (prev_cmp == 0)
		 * we can safely assume the target key will always be in slot 0
		 * on lower levels due to the invariants BTRFS' btree provides,
		 * namely that a btrfs_key_ptr entry always points to the
		 * lowest key in the child node, thus we can skip searching
		 * lower levels
		 */
		if (prev_cmp == 0) {
			slot = 0;
			ret = 0;
		} else {
			ret = btrfs_bin_search(b, key, &slot);
			prev_cmp = ret;
			if (ret < 0)
				goto done;
		}

		if (level == 0) {
			p->slots[level] = slot;
			/* make room in the leaf if the insert won't fit */
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, NULL);
			goto done;
		}
		/* inexact match in a node: descend through the previous slot */
		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
					     &write_lock_level);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}
		b = p->nodes[level];
		slot = p->slots[level];

		/*
		 * Slot 0 is special, if we change the key we have to update
		 * the parent pointer which means we must have a write lock on
		 * the parent
		 */
		if (slot == 0 && ins_len && write_lock_level < level + 1) {
			write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		unlock_up(p, level, lowest_unlock, min_write_lock_level,
			  &write_lock_level);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		if (!p->skip_locking) {
			level = btrfs_header_level(b);
			if (level <= write_lock_level) {
				if (!btrfs_try_tree_write_lock(b)) {
					btrfs_set_path_blocking(p);
					btrfs_tree_lock(b);
				}
				p->locks[level] = BTRFS_WRITE_LOCK;
			} else {
				if (!btrfs_tree_read_lock_atomic(b)) {
					btrfs_set_path_blocking(p);
					btrfs_tree_read_lock(b);
				}
				p->locks[level] = BTRFS_READ_LOCK;
			}
			p->nodes[level] = b;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
}

J
Jan Schmidt 已提交
2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908
/*
 * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
 * current state of the tree together with the operations recorded in the tree
 * modification log to search for the key in a previous version of this tree, as
 * denoted by the time_seq parameter.
 *
 * Naturally, there is no support for insert, delete or cow operations.
 *
 * The resulting path and return value will be set up as if we called
 * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
 */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	/* commit roots are immutable, no mod log replay is needed */
	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		int dec = 0;

		level = btrfs_header_level(b);
		p->nodes[level] = b;

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		ret = btrfs_bin_search(b, key, &slot);
		if (ret < 0)
			goto done;

		if (level == 0) {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}

		/* inexact match in a node: descend through the previous slot */
		if (ret && slot > 0) {
			dec = 1;
			slot--;
		}
		p->slots[level] = slot;
		unlock_up(p, level, lowest_unlock, 0, NULL);

		if (level == lowest_level) {
			if (dec)
				p->slots[level]++;
			goto done;
		}

		err = read_block_for_search(root, p, &b, level, slot, key);
		if (err == -EAGAIN)
			goto again;
		if (err) {
			ret = err;
			goto done;
		}

		level = btrfs_header_level(b);
		if (!btrfs_tree_read_lock_atomic(b)) {
			btrfs_set_path_blocking(p);
			btrfs_tree_read_lock(b);
		}
		/* replay the mod log to get the block as of time_seq */
		b = tree_mod_log_rewind(fs_info, p, b, time_seq);
		if (!b) {
			ret = -ENOMEM;
			goto done;
		}
		p->locks[level] = BTRFS_READ_LOCK;
		p->nodes[level] = b;
	}
	ret = 1;
done:
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}

3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018
/*
 * helper to use instead of search slot if no exact match is needed but
 * instead the next or previous item should be returned.
 * When find_higher is true, the next higher item is returned, the next lower
 * otherwise.
 * When return_any and find_higher are both true, and no higher item is found,
 * return the next lower instead.
 * When return_any is true and find_higher is false, and no lower item is found,
 * return the next higher instead.
 * It returns 0 if any item is found, 1 if none is found (tree empty), and
 * < 0 on error
 */
int btrfs_search_slot_for_read(struct btrfs_root *root,
3019 3020 3021
			       const struct btrfs_key *key,
			       struct btrfs_path *p, int find_higher,
			       int return_any)
3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055
{
	int ret;
	struct extent_buffer *leaf;

again:
	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
	if (ret <= 0)
		return ret;
	/*
	 * a return value of 1 means the path is at the position where the
	 * item should be inserted. Normally this is the next bigger item,
	 * but in case the previous item is the last in a leaf, path points
	 * to the first free slot in the previous leaf, i.e. at an invalid
	 * item.
	 */
	leaf = p->nodes[0];

	if (find_higher) {
		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, p);
			if (ret <= 0)
				return ret;
			if (!return_any)
				return 1;
			/*
			 * no higher item found, return the next
			 * lower instead
			 */
			return_any = 0;
			find_higher = 0;
			btrfs_release_path(p);
			goto again;
		}
	} else {
3056 3057 3058 3059 3060
		if (p->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, p);
			if (ret < 0)
				return ret;
			if (!ret) {
3061 3062 3063
				leaf = p->nodes[0];
				if (p->slots[0] == btrfs_header_nritems(leaf))
					p->slots[0]--;
3064
				return 0;
3065
			}
3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076
			if (!return_any)
				return 1;
			/*
			 * no lower item found, return the next
			 * higher instead
			 */
			return_any = 0;
			find_higher = 1;
			btrfs_release_path(p);
			goto again;
		} else {
3077 3078 3079 3080 3081 3082
			--p->slots[0];
		}
	}
	return 0;
}

C
Chris Mason 已提交
3083 3084 3085 3086 3087 3088
/*
 * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node is points to 'key'.
 * This is used after shifting pointers to the left, so it stops
 * fixing up pointers when a given leaf/node is not in slot 0 of the
 * higher levels
C
Chris Mason 已提交
3089
 *
C
Chris Mason 已提交
3090
 */
3091
static void fixup_low_keys(struct btrfs_path *path,
3092
			   struct btrfs_disk_key *key, int level)
3093 3094
{
	int i;
3095
	struct extent_buffer *t;
3096
	int ret;
3097

C
Chris Mason 已提交
3098
	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3099
		int tslot = path->slots[i];
3100

3101
		if (!path->nodes[i])
3102
			break;
3103
		t = path->nodes[i];
3104 3105 3106
		ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
				GFP_ATOMIC);
		BUG_ON(ret < 0);
3107
		btrfs_set_node_key(t, key, tslot);
C
Chris Mason 已提交
3108
		btrfs_mark_buffer_dirty(path->nodes[i]);
3109 3110 3111 3112 3113
		if (tslot != 0)
			break;
	}
}

Z
Zheng Yan 已提交
3114 3115 3116 3117 3118 3119
/*
 * update item key.
 *
 * This function isn't completely safe. It's the caller's responsibility
 * that the new key won't break the order
 */
3120 3121
void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
3122
			     const struct btrfs_key *new_key)
Z
Zheng Yan 已提交
3123 3124 3125 3126 3127 3128 3129 3130 3131
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142
		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
Z
Zheng Yan 已提交
3143 3144 3145
	}
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156
		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
Z
Zheng Yan 已提交
3157 3158 3159 3160 3161 3162
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	if (slot == 0)
3163
		fixup_low_keys(path, &disk_key, 1);
Z
Zheng Yan 已提交
3164 3165
}

C
Chris Mason 已提交
3166 3167
/*
 * try to push data from one node into the next node left in the
3168
 * tree.
C
Chris Mason 已提交
3169 3170 3171
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
C
Chris Mason 已提交
3172
 */
3173
static int push_node_left(struct btrfs_trans_handle *trans,
3174
			  struct extent_buffer *dst,
3175
			  struct extent_buffer *src, int empty)
3176
{
3177
	struct btrfs_fs_info *fs_info = trans->fs_info;
3178
	int push_items = 0;
3179 3180
	int src_nritems;
	int dst_nritems;
C
Chris Mason 已提交
3181
	int ret = 0;
3182

3183 3184
	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
3185
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3186 3187
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);
3188

3189
	if (!empty && src_nritems <= 8)
3190 3191
		return 1;

C
Chris Mason 已提交
3192
	if (push_items <= 0)
3193 3194
		return 1;

3195
	if (empty) {
3196
		push_items = min(src_nritems, push_items);
3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);
3209

3210
	ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
3211
	if (ret) {
3212
		btrfs_abort_transaction(trans, ret);
3213 3214
		return ret;
	}
3215 3216 3217
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
C
Chris Mason 已提交
3218
			   push_items * sizeof(struct btrfs_key_ptr));
3219

3220
	if (push_items < src_nritems) {
3221
		/*
3222 3223
		 * Don't call tree_mod_log_insert_move here, key removal was
		 * already fully logged by tree_mod_log_eb_copy above.
3224
		 */
3225 3226 3227 3228 3229 3230 3231 3232 3233
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);
Z
Zheng Yan 已提交
3234

3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246
	return ret;
}

/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
3247 3248 3249
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
3250
{
3251
	struct btrfs_fs_info *fs_info = trans->fs_info;
3252 3253 3254 3255 3256 3257
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

3258 3259 3260
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

3261 3262
	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
3263
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
C
Chris Mason 已提交
3264
	if (push_items <= 0)
3265
		return 1;
3266

C
Chris Mason 已提交
3267
	if (src_nritems < 4)
3268
		return 1;
3269 3270 3271

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
C
Chris Mason 已提交
3272
	if (max_push >= src_nritems)
3273
		return 1;
Y
Yan 已提交
3274

3275 3276 3277
	if (max_push < push_items)
		push_items = max_push;

3278 3279
	ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
	BUG_ON(ret < 0);
3280 3281 3282 3283
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));
C
Chris Mason 已提交
3284

3285 3286
	ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
				   push_items);
3287
	if (ret) {
3288
		btrfs_abort_transaction(trans, ret);
3289 3290
		return ret;
	}
3291 3292 3293
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
C
Chris Mason 已提交
3294
			   push_items * sizeof(struct btrfs_key_ptr));
3295

3296 3297
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
3298

3299 3300
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);
Z
Zheng Yan 已提交
3301

C
Chris Mason 已提交
3302
	return ret;
3303 3304
}

C
Chris Mason 已提交
3305 3306 3307 3308
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
C
Chris Mason 已提交
3309 3310
 *
 * returns zero on success or < 0 on failure.
C
Chris Mason 已提交
3311
 */
C
Chris Mason 已提交
3312
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3313
			   struct btrfs_root *root,
3314
			   struct btrfs_path *path, int level)
C
Chris Mason 已提交
3315
{
3316
	struct btrfs_fs_info *fs_info = root->fs_info;
3317
	u64 lower_gen;
3318 3319
	struct extent_buffer *lower;
	struct extent_buffer *c;
3320
	struct extent_buffer *old;
3321
	struct btrfs_disk_key lower_key;
3322
	int ret;
C
Chris Mason 已提交
3323 3324 3325 3326

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

3327 3328 3329 3330 3331 3332
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

3333 3334
	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
					 root->node->start, 0);
3335 3336
	if (IS_ERR(c))
		return PTR_ERR(c);
3337

3338
	root_add_used(root, fs_info->nodesize);
3339

3340 3341
	btrfs_set_header_nritems(c, 1);
	btrfs_set_node_key(c, &lower_key, 0);
3342
	btrfs_set_node_blockptr(c, 0, lower->start);
3343
	lower_gen = btrfs_header_generation(lower);
Z
Zheng Yan 已提交
3344
	WARN_ON(lower_gen != trans->transid);
3345 3346

	btrfs_set_node_ptr_generation(c, 0, lower_gen);
3347

3348
	btrfs_mark_buffer_dirty(c);
3349

3350
	old = root->node;
3351 3352
	ret = tree_mod_log_insert_root(root->node, c, 0);
	BUG_ON(ret < 0);
3353
	rcu_assign_pointer(root->node, c);
3354 3355 3356 3357

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

3358
	add_root_to_dirty_list(root);
D
David Sterba 已提交
3359
	atomic_inc(&c->refs);
3360
	path->nodes[level] = c;
3361
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
C
Chris Mason 已提交
3362 3363 3364 3365
	path->slots[level] = 0;
	return 0;
}

C
Chris Mason 已提交
3366 3367 3368
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
C
Chris Mason 已提交
3369
 *
C
Chris Mason 已提交
3370 3371 3372
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
 */
3373
static void insert_ptr(struct btrfs_trans_handle *trans,
3374
		       struct btrfs_path *path,
3375
		       struct btrfs_disk_key *key, u64 bytenr,
3376
		       int slot, int level)
C
Chris Mason 已提交
3377
{
3378
	struct extent_buffer *lower;
C
Chris Mason 已提交
3379
	int nritems;
3380
	int ret;
C
Chris Mason 已提交
3381 3382

	BUG_ON(!path->nodes[level]);
3383
	btrfs_assert_tree_locked(path->nodes[level]);
3384 3385
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
S
Stoyan Gaydarov 已提交
3386
	BUG_ON(slot > nritems);
3387
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
C
Chris Mason 已提交
3388
	if (slot != nritems) {
3389 3390
		if (level) {
			ret = tree_mod_log_insert_move(lower, slot + 1, slot,
3391
					nritems - slot);
3392 3393
			BUG_ON(ret < 0);
		}
3394 3395 3396
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
C
Chris Mason 已提交
3397
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
C
Chris Mason 已提交
3398
	}
3399
	if (level) {
3400 3401
		ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
				GFP_NOFS);
3402 3403
		BUG_ON(ret < 0);
	}
3404
	btrfs_set_node_key(lower, key, slot);
3405
	btrfs_set_node_blockptr(lower, slot, bytenr);
3406 3407
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3408 3409
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
C
Chris Mason 已提交
3410 3411
}

C
Chris Mason 已提交
3412 3413 3414 3415 3416 3417
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
C
Chris Mason 已提交
3418 3419
 *
 * returns 0 on success and < 0 on failure
C
Chris Mason 已提交
3420
 */
3421 3422 3423
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
3424
{
3425
	struct btrfs_fs_info *fs_info = root->fs_info;
3426 3427 3428
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
3429
	int mid;
C
Chris Mason 已提交
3430
	int ret;
3431
	u32 c_nritems;
3432

3433
	c = path->nodes[level];
3434
	WARN_ON(btrfs_header_generation(c) != trans->transid);
3435
	if (c == root->node) {
3436
		/*
3437 3438
		 * trying to split the root, lets make a new one
		 *
3439
		 * tree mod log: We don't log_removal old root in
3440 3441 3442 3443 3444
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with tree_mod_log_eb_copy. We're holding a
		 * tree lock on the buffer, which is why we cannot race with
		 * other tree_mod_log users.
3445
		 */
3446
		ret = insert_new_root(trans, root, path, level + 1);
C
Chris Mason 已提交
3447 3448
		if (ret)
			return ret;
3449
	} else {
3450
		ret = push_nodes_for_insert(trans, root, path, level);
3451 3452
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
3453
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3454
			return 0;
3455 3456
		if (ret < 0)
			return ret;
3457
	}
3458

3459
	c_nritems = btrfs_header_nritems(c);
3460 3461
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);
3462

3463 3464
	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
					     c->start, 0);
3465 3466 3467
	if (IS_ERR(split))
		return PTR_ERR(split);

3468
	root_add_used(root, fs_info->nodesize);
3469
	ASSERT(btrfs_header_level(c) == level);
3470

3471
	ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3472
	if (ret) {
3473
		btrfs_abort_transaction(trans, ret);
3474 3475
		return ret;
	}
3476 3477 3478 3479 3480 3481
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
C
Chris Mason 已提交
3482 3483
	ret = 0;

3484 3485 3486
	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

3487
	insert_ptr(trans, path, &disk_key, split->start,
3488
		   path->slots[level + 1] + 1, level + 1);
C
Chris Mason 已提交
3489

C
Chris Mason 已提交
3490
	if (path->slots[level] >= mid) {
C
Chris Mason 已提交
3491
		path->slots[level] -= mid;
3492
		btrfs_tree_unlock(c);
3493 3494
		free_extent_buffer(c);
		path->nodes[level] = split;
C
Chris Mason 已提交
3495 3496
		path->slots[level + 1] += 1;
	} else {
3497
		btrfs_tree_unlock(split);
3498
		free_extent_buffer(split);
3499
	}
C
Chris Mason 已提交
3500
	return ret;
3501 3502
}

C
Chris Mason 已提交
3503 3504 3505 3506 3507
/*
 * how many bytes are required to store the items in a leaf.  start
 * and nr indicate which items in the leaf to check.  This totals up the
 * space used both by the item structs and the item data
 */
3508
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3509
{
J
Josef Bacik 已提交
3510 3511
	struct btrfs_item *start_item;
	struct btrfs_item *end_item;
3512
	int data_len;
3513
	int nritems = btrfs_header_nritems(l);
3514
	int end = min(nritems, start + nr) - 1;
3515 3516 3517

	if (!nr)
		return 0;
3518 3519
	start_item = btrfs_item_nr(start);
	end_item = btrfs_item_nr(end);
3520 3521 3522
	data_len = btrfs_item_offset(l, start_item) +
		   btrfs_item_size(l, start_item);
	data_len = data_len - btrfs_item_offset(l, end_item);
C
Chris Mason 已提交
3523
	data_len += sizeof(struct btrfs_item) * nr;
3524
	WARN_ON(data_len < 0);
3525 3526 3527
	return data_len;
}

3528 3529 3530 3531 3532
/*
 * The space between the end of the leaf items and
 * the start of the leaf data.  IOW, how much room
 * the leaf has left for both items and data
 */
3533
noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
3534
{
3535
	struct btrfs_fs_info *fs_info = leaf->fs_info;
3536 3537
	int nritems = btrfs_header_nritems(leaf);
	int ret;
3538 3539

	ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3540
	if (ret < 0) {
3541 3542 3543 3544 3545
		btrfs_crit(fs_info,
			   "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
			   ret,
			   (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
			   leaf_space_used(leaf, 0, nritems), nritems);
3546 3547
	}
	return ret;
3548 3549
}

3550 3551 3552 3553
/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 */
3554
static noinline int __push_leaf_right(struct btrfs_path *path,
3555 3556
				      int data_size, int empty,
				      struct extent_buffer *right,
3557 3558
				      int free_space, u32 left_nritems,
				      u32 min_slot)
C
Chris Mason 已提交
3559
{
3560
	struct btrfs_fs_info *fs_info = right->fs_info;
3561
	struct extent_buffer *left = path->nodes[0];
3562
	struct extent_buffer *upper = path->nodes[1];
3563
	struct btrfs_map_token token;
3564
	struct btrfs_disk_key disk_key;
C
Chris Mason 已提交
3565
	int slot;
3566
	u32 i;
C
Chris Mason 已提交
3567 3568
	int push_space = 0;
	int push_items = 0;
C
Chris Mason 已提交
3569
	struct btrfs_item *item;
3570
	u32 nr;
3571
	u32 right_nritems;
3572
	u32 data_end;
3573
	u32 this_item_size;
C
Chris Mason 已提交
3574

3575 3576 3577
	if (empty)
		nr = 0;
	else
3578
		nr = max_t(u32, 1, min_slot);
3579

Z
Zheng Yan 已提交
3580
	if (path->slots[0] >= left_nritems)
3581
		push_space += data_size;
Z
Zheng Yan 已提交
3582

3583
	slot = path->slots[1];
3584 3585
	i = left_nritems - 1;
	while (i >= nr) {
3586
		item = btrfs_item_nr(i);
3587

Z
Zheng Yan 已提交
3588 3589 3590 3591
		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
3592 3593
				int space = btrfs_leaf_free_space(left);

Z
Zheng Yan 已提交
3594 3595 3596 3597 3598
				if (space + push_space * 2 > free_space)
					break;
			}
		}

C
Chris Mason 已提交
3599
		if (path->slots[0] == i)
3600
			push_space += data_size;
3601 3602 3603

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
C
Chris Mason 已提交
3604
			break;
Z
Zheng Yan 已提交
3605

C
Chris Mason 已提交
3606
		push_items++;
3607
		push_space += this_item_size + sizeof(*item);
3608 3609 3610
		if (i == 0)
			break;
		i--;
3611
	}
3612

3613 3614
	if (push_items == 0)
		goto out_unlock;
3615

J
Julia Lawall 已提交
3616
	WARN_ON(!empty && push_items == left_nritems);
3617

C
Chris Mason 已提交
3618
	/* push left to right */
3619
	right_nritems = btrfs_header_nritems(right);
3620

3621
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3622
	push_space -= leaf_data_end(left);
3623

C
Chris Mason 已提交
3624
	/* make room in the right data area */
3625
	data_end = leaf_data_end(right);
3626
	memmove_extent_buffer(right,
3627 3628
			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
3629
			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3630

C
Chris Mason 已提交
3631
	/* copy from the left data area */
3632
	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3633
		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3634
		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
C
Chris Mason 已提交
3635
		     push_space);
3636 3637 3638 3639 3640

	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

C
Chris Mason 已提交
3641
	/* copy the items from left to right */
3642 3643 3644
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));
C
Chris Mason 已提交
3645 3646

	/* update the item pointers */
3647
	btrfs_init_map_token(&token, right);
3648
	right_nritems += push_items;
3649
	btrfs_set_header_nritems(right, right_nritems);
3650
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3651
	for (i = 0; i < right_nritems; i++) {
3652
		item = btrfs_item_nr(i);
3653 3654
		push_space -= btrfs_token_item_size(&token, item);
		btrfs_set_token_item_offset(&token, item, push_space);
3655 3656
	}

3657
	left_nritems -= push_items;
3658
	btrfs_set_header_nritems(left, left_nritems);
C
Chris Mason 已提交
3659

3660 3661
	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
3662
	else
3663
		btrfs_clean_tree_block(left);
3664

3665
	btrfs_mark_buffer_dirty(right);
3666

3667 3668
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
C
Chris Mason 已提交
3669
	btrfs_mark_buffer_dirty(upper);
C
Chris Mason 已提交
3670

C
Chris Mason 已提交
3671
	/* then fixup the leaf pointer in the path */
3672 3673
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
3674
		if (btrfs_header_nritems(path->nodes[0]) == 0)
3675
			btrfs_clean_tree_block(path->nodes[0]);
3676
		btrfs_tree_unlock(path->nodes[0]);
3677 3678
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
C
Chris Mason 已提交
3679 3680
		path->slots[1] += 1;
	} else {
3681
		btrfs_tree_unlock(right);
3682
		free_extent_buffer(right);
C
Chris Mason 已提交
3683 3684
	}
	return 0;
3685 3686 3687 3688 3689

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
C
Chris Mason 已提交
3690
}
3691

3692 3693 3694 3695 3696 3697
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
3698 3699 3700
 *
 * this will push starting from min_slot to the end of the leaf.  It won't
 * push any slot lower than min_slot
3701 3702
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3703 3704 3705
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	if (!path->nodes[1])
		return 1;

	slot = path->slots[1];
	upper = path->nodes[1];
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

3725
	right = btrfs_read_node_slot(upper, slot + 1);
3726 3727 3728 3729 3730
	/*
	 * slot + 1 is not valid or we fail to read the right node,
	 * no big deal, just return.
	 */
	if (IS_ERR(right))
T
Tsutomu Itoh 已提交
3731 3732
		return 1;

3733
	btrfs_tree_lock(right);
3734
	btrfs_set_lock_blocking_write(right);
3735

3736
	free_space = btrfs_leaf_free_space(right);
3737 3738 3739 3740 3741 3742 3743 3744 3745
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right);
	if (ret)
		goto out_unlock;

3746
	free_space = btrfs_leaf_free_space(right);
3747 3748 3749 3750 3751 3752 3753
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

3754 3755 3756 3757
	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
3758
		 * no need to touch/dirty our left leaf. */
3759 3760 3761 3762 3763 3764 3765 3766
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

3767
	return __push_leaf_right(path, min_data_size, empty,
3768
				right, free_space, left_nritems, min_slot);
3769 3770 3771 3772 3773 3774
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}

C
Chris Mason 已提交
3775 3776 3777
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3778 3779 3780 3781
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
 * items
C
Chris Mason 已提交
3782
 */
3783
static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
3784
				     int empty, struct extent_buffer *left,
3785 3786
				     int free_space, u32 right_nritems,
				     u32 max_slot)
3787
{
3788
	struct btrfs_fs_info *fs_info = left->fs_info;
3789 3790
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
3791 3792 3793
	int i;
	int push_space = 0;
	int push_items = 0;
C
Chris Mason 已提交
3794
	struct btrfs_item *item;
3795
	u32 old_left_nritems;
3796
	u32 nr;
C
Chris Mason 已提交
3797
	int ret = 0;
3798 3799
	u32 this_item_size;
	u32 old_left_item_size;
3800 3801
	struct btrfs_map_token token;

3802
	if (empty)
3803
		nr = min(right_nritems, max_slot);
3804
	else
3805
		nr = min(right_nritems - 1, max_slot);
3806 3807

	for (i = 0; i < nr; i++) {
3808
		item = btrfs_item_nr(i);
3809

Z
Zheng Yan 已提交
3810 3811 3812 3813
		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
3814 3815
				int space = btrfs_leaf_free_space(right);

Z
Zheng Yan 已提交
3816 3817 3818 3819 3820
				if (space + push_space * 2 > free_space)
					break;
			}
		}

3821
		if (path->slots[0] == i)
3822
			push_space += data_size;
3823 3824 3825

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
3826
			break;
3827

3828
		push_items++;
3829 3830 3831
		push_space += this_item_size + sizeof(*item);
	}

3832
	if (push_items == 0) {
3833 3834
		ret = 1;
		goto out;
3835
	}
3836
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3837

3838
	/* push data from right to left */
3839 3840 3841 3842 3843
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

3844
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
C
Chris Mason 已提交
3845
		     btrfs_item_offset_nr(right, push_items - 1);
3846

3847
	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3848
		     leaf_data_end(left) - push_space,
3849
		     BTRFS_LEAF_DATA_OFFSET +
3850
		     btrfs_item_offset_nr(right, push_items - 1),
C
Chris Mason 已提交
3851
		     push_space);
3852
	old_left_nritems = btrfs_header_nritems(left);
3853
	BUG_ON(old_left_nritems <= 0);
3854

3855
	btrfs_init_map_token(&token, left);
3856
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
C
Chris Mason 已提交
3857
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3858
		u32 ioff;
3859

3860
		item = btrfs_item_nr(i);
3861

3862 3863 3864
		ioff = btrfs_token_item_offset(&token, item);
		btrfs_set_token_item_offset(&token, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3865
	}
3866
	btrfs_set_header_nritems(left, old_left_nritems + push_items);
3867 3868

	/* fixup right node */
J
Julia Lawall 已提交
3869 3870
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
C
Chris Mason 已提交
3871
		       right_nritems);
3872 3873 3874

	if (push_items < right_nritems) {
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
3875
						  leaf_data_end(right);
3876
		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
3877
				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3878
				      BTRFS_LEAF_DATA_OFFSET +
3879
				      leaf_data_end(right), push_space);
3880 3881

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3882 3883 3884
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
3885
	}
3886 3887

	btrfs_init_map_token(&token, right);
3888 3889
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
3890
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3891
	for (i = 0; i < right_nritems; i++) {
3892
		item = btrfs_item_nr(i);
3893

3894 3895
		push_space = push_space - btrfs_token_item_size(&token, item);
		btrfs_set_token_item_offset(&token, item, push_space);
3896
	}
3897

3898
	btrfs_mark_buffer_dirty(left);
3899 3900
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
3901
	else
3902
		btrfs_clean_tree_block(right);
3903

3904
	btrfs_item_key(right, &disk_key, 0);
3905
	fixup_low_keys(path, &disk_key, 1);
3906 3907 3908 3909

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
3910
		btrfs_tree_unlock(path->nodes[0]);
3911 3912
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
3913 3914
		path->slots[1] -= 1;
	} else {
3915
		btrfs_tree_unlock(left);
3916
		free_extent_buffer(left);
3917 3918
		path->slots[0] -= push_items;
	}
3919
	BUG_ON(path->slots[0] < 0);
C
Chris Mason 已提交
3920
	return ret;
3921 3922 3923 3924
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
3925 3926
}

3927 3928 3929
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3930 3931 3932 3933
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
 * items
3934 3935
 */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3936 3937
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957
{
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	slot = path->slots[1];
	if (slot == 0)
		return 1;
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

3958
	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3959 3960 3961 3962 3963
	/*
	 * slot - 1 is not valid or we fail to read the left node,
	 * no big deal, just return.
	 */
	if (IS_ERR(left))
T
Tsutomu Itoh 已提交
3964 3965
		return 1;

3966
	btrfs_tree_lock(left);
3967
	btrfs_set_lock_blocking_write(left);
3968

3969
	free_space = btrfs_leaf_free_space(left);
3970 3971 3972 3973 3974 3975 3976 3977 3978 3979
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
3980 3981
		if (ret == -ENOSPC)
			ret = 1;
3982 3983 3984
		goto out;
	}

3985
	free_space = btrfs_leaf_free_space(left);
3986 3987 3988 3989 3990
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

3991
	return __push_leaf_left(path, min_data_size,
3992 3993
			       empty, left, free_space, right_nritems,
			       max_slot);
3994 3995 3996 3997 3998 3999 4000 4001 4002 4003
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}

/*
 * Helper for split_leaf(): move the items [mid, nritems) and their data from
 * leaf 'l' into the new empty leaf 'right', link 'right' into the parent
 * node, and fix up the path so it still points at the slot it was on.
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	/* number of items that move into 'right' */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);

	/* copy the item headers ... */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* ... and the item data, packed against the end of 'right' */
	copy_extent_buffer(right, l,
		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(l), data_copy_size);

	/* distance the data moved towards the end of the new leaf */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);

	/* adjust the copied item headers to the new data locations */
	btrfs_init_map_token(&token, right);
	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(i);
		u32 ioff;

		ioff = btrfs_token_item_offset(&token, item);
		btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
	}

	/* shrink the old leaf and link the new one into the parent */
	btrfs_set_header_nritems(l, mid);
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* keep the path pointing at whichever leaf now holds 'slot' */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}

4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081
/*
 * double splits happen when we need to insert a big item in the middle
 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
 *          A                 B                 C
 *
 * We avoid this by trying to push the items on either side of our target
 * into the adjacent leaves.  If all goes well we can avoid the double split
 * completely.
 *
 * Returns 0 if any progress was made (or the leaf now has room), 1 if
 * nothing could be pushed, < 0 on error.
 */
static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  int data_size)
{
	int ret;
	int progress = 0;
	int slot;
	u32 nritems;
	int space_needed = data_size;

	slot = path->slots[0];
	/* the free space already in the leaf counts towards what we need */
	if (slot < btrfs_header_nritems(path->nodes[0]))
		space_needed -= btrfs_leaf_free_space(path->nodes[0]);

	/*
	 * try to push all the items after our slot into the
	 * right leaf
	 */
	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
	if (ret < 0)
		return ret;

	if (ret == 0)
		progress++;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * our goal is to get our slot at the start or end of a leaf.  If
	 * we've done so we're done
	 */
	if (path->slots[0] == 0 || path->slots[0] == nritems)
		return 0;

	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
		return 0;

	/* try to push all the items before our slot into the next leaf */
	slot = path->slots[0];
	space_needed = data_size;
	if (slot > 0)
		space_needed -= btrfs_leaf_free_space(path->nodes[0]);
	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
	if (ret < 0)
		return ret;

	if (ret == 0)
		progress++;

	if (progress)
		return 0;
	return 1;
}

C
Chris Mason 已提交
4127 4128 4129
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
 * returns 0 if all went well and < 0 on failure.
 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	/*
	 * split == 0: don't move anything, just hand out a fresh empty leaf
	 * split == 1: split once at 'mid'
	 * split == 2: the item at 'slot' needs a leaf of its own (double split)
	 */
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	/* an extended item can never exceed a whole leaf */
	if (extend && data_size + btrfs_item_size_nr(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			space_needed = data_size;
			if (slot > 0)
				space_needed -= btrfs_leaf_free_space(l);
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(l) >= data_size)
			return 0;
	}

	/* splitting the root leaf needs a new level above it first */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	/* pick the split point so the half containing 'slot' will fit */
	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (slot >= nritems) {
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (!extend && data_size && slot == 0) {
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	/* key of the new right leaf: the insert key or the item at 'mid' */
	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
					     l->start, 0);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, fs_info->nodesize);

	if (split == 0) {
		/* nothing moves: just link an empty leaf next to this one */
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(path, &disk_key, 1);
		}
		/*
		 * We create a new leaf 'right' for the required ins_len and
		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
		 * the content of ins_len to 'right'.
		 */
		return ret;
	}

	copy_for_split(trans, path, l, right, slot, mid, nritems);

	if (split == 2) {
		/* a double split never needs more than two passes */
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	/* try pushing neighbours first so we can avoid a double split */
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
		return 0;
	goto again;
}

Y
Yan, Zheng 已提交
4293 4294 4295
/*
 * Make sure the leaf at path->nodes[0] has at least 'ins_len' bytes free so
 * an item can be split or duplicated in place.  The path may be released and
 * re-searched; returns 0 with the path re-positioned on the same item,
 * -EAGAIN if the item changed or moved underneath us, or another negative
 * errno on failure.  Only EXTENT_DATA and EXTENT_CSUM items are supported.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	/* already enough room, nothing to do */
	if (btrfs_leaf_free_space(leaf) >= ins_len)
		return 0;

	/* remember size (and extent length) so we can detect changes below */
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	/* ret > 0 means the exact key is gone */
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
		goto err;

	/* the leaf has  changed, it now has room.  return now */
	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	btrfs_set_path_blocking(path);
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}

4360
/*
 * Split the item at path->slots[0] in place: the original item keeps its
 * first 'split_offset' bytes, and a new item with key 'new_key' is inserted
 * right after it holding the remaining bytes.  The leaf must already have
 * room for one extra struct btrfs_item (see setup_leaf_for_split()).
 * Returns 0 on success or -ENOMEM.
 */
static noinline int split_item(struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* stash a copy of the whole item so we can rewrite both halves */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	/* the new item goes directly after the one being split */
	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	/* new item takes the tail of the data, original keeps the head */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
	kfree(buf);
	return 0;
}

/*
 * This function splits a single item into two items,
 * giving 'new_key' to the new item and splitting the
 * old one at split_offset (from the start of the item).
 *
 * The path may be released by this operation.  After
 * the split, the path is pointing to the old item.  The
 * new item is going to be in the same node as the old one.
 *
 * Note, the item being split must be smaller enough to live alone on
 * a tree block with room for one extra struct btrfs_item
 *
 * This allows us to split the item in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_split_item(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_path *path,
4447
		     const struct btrfs_key *new_key,
Y
Yan, Zheng 已提交
4448 4449 4450 4451 4452 4453 4454 4455
		     unsigned long split_offset)
{
	int ret;
	ret = setup_leaf_for_split(trans, root, path,
				   sizeof(struct btrfs_item));
	if (ret)
		return ret;

4456
	ret = split_item(path, new_key, split_offset);
4457 4458 4459
	return ret;
}

Y
Yan, Zheng 已提交
4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470
/*
 * This function duplicate a item, giving 'new_key' to the new item.
 * It guarantees both items live in the same tree leaf and the new item
 * is contiguous with the original item.
 *
 * This allows us to split file extent in place, keeping a lock on the
 * leaf the entire time.
 */
int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root,
			 struct btrfs_path *path,
			 const struct btrfs_key *new_key)
{
	struct extent_buffer *leaf;
	int ret;
	u32 item_size;

	leaf = path->nodes[0];
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	/* make sure the leaf can hold a full copy plus one item header */
	ret = setup_leaf_for_split(trans, root, path,
				   item_size + sizeof(struct btrfs_item));
	if (ret)
		return ret;

	/* insert an empty item of the same size right after the original */
	path->slots[0]++;
	setup_items_for_insert(root, path, new_key, &item_size,
			       item_size, item_size +
			       sizeof(struct btrfs_item), 1);
	/* reload the leaf, then copy the original item's data over */
	leaf = path->nodes[0];
	memcpy_extent_buffer(leaf,
			     btrfs_item_ptr_offset(leaf, path->slots[0]),
			     btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
			     item_size);
	return 0;
}

C
Chris Mason 已提交
4496 4497 4498 4499 4500 4501
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(&token, item);
		btrfs_set_token_item_offset(&token, item, ioff + size_diff);
	}

	/* shift the data */
	if (from_end) {
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			/*
			 * inline extents keep their fixed-size header; move
			 * it down so only the inline data is chopped off
			 */
			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data_start - data_end);

		/* chopping off the front moves the key offset forward */
		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(path, &disk_key, 1);
	}

	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}

C
Chris Mason 已提交
4595
/*
 * make the item pointed to by the path bigger, data_size is the added size.
 */
void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	/* the caller must have made room already */
	if (btrfs_leaf_free_space(leaf) < data_size) {
		btrfs_print_leaf(leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_end_nr(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(leaf);
		btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
			   slot, nritems);
		BUG();
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	btrfs_init_map_token(&token, leaf);
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(&token, item);
		btrfs_set_token_item_offset(&token, item, ioff - data_size);
	}

	/* shift the data */
	memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
		      data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
		      data_end, old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}

C
Chris Mason 已提交
4660
/*
 * this is a helper for btrfs_insert_empty_items, the main goal here is
 * to save stack depth by doing the bulk of the work in a function
 * that doesn't call btrfs_search_slot
 *
 * 'nr' items with keys cpu_key[0..nr) and sizes data_size[0..nr) are
 * reserved starting at path->slots[0]; total_data is the sum of the sizes
 * and total_size additionally includes the item headers.
 */
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
			    const struct btrfs_key *cpu_key, u32 *data_size,
			    u32 total_data, u32 total_size, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_item *item;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;

	/* inserting at slot 0 changes the first key seen by the parents */
	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		fixup_low_keys(path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(leaf);

	if (btrfs_leaf_free_space(leaf) < total_size) {
		btrfs_print_leaf(leaf);
		btrfs_crit(fs_info, "not enough freespace need %u have %d",
			   total_size, btrfs_leaf_free_space(leaf));
		BUG();
	}

	btrfs_init_map_token(&token, leaf);
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(leaf);
			btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
				   slot, old_data, data_end);
			BUG();
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(&token, item);
			btrfs_set_token_item_offset(&token, item,
						    ioff - total_data);
		}
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
			      data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(slot + i);
		btrfs_set_token_item_offset(&token, item, data_end - data_size[i]);
		data_end -= data_size[i];
		btrfs_set_token_item_size(&token, item, data_size[i]);
	}

	btrfs_set_header_nritems(leaf, nritems + nr);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(leaf) < 0) {
		btrfs_print_leaf(leaf);
		BUG();
	}
}

/*
 * Given a key and some data, insert items into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_path *path,
			    const struct btrfs_key *cpu_key, u32 *data_size,
			    int nr)
{
	u32 total_data = 0;
	u32 total_size;
	int ret;
	int i;

	/* total payload, then add one item header per new item */
	for (i = 0; i < nr; i++)
		total_data += data_size[i];
	total_size = total_data + (nr * sizeof(struct btrfs_item));

	/* search with ins_len set so the leaf is made big enough */
	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
	if (ret == 0)
		return -EEXIST;
	if (ret < 0)
		return ret;

	BUG_ON(path->slots[0] < 0);

	setup_items_for_insert(root, path, cpu_key, data_size,
			       total_data, total_size, nr);
	return 0;
}

/*
 * Given a key and some data, insert an item into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
4789 4790 4791
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *cpu_key, void *data,
		      u32 data_size)
4792 4793
{
	int ret = 0;
C
Chris Mason 已提交
4794
	struct btrfs_path *path;
4795 4796
	struct extent_buffer *leaf;
	unsigned long ptr;
4797

C
Chris Mason 已提交
4798
	path = btrfs_alloc_path();
T
Tsutomu Itoh 已提交
4799 4800
	if (!path)
		return -ENOMEM;
C
Chris Mason 已提交
4801
	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4802
	if (!ret) {
4803 4804 4805 4806
		leaf = path->nodes[0];
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(leaf);
4807
	}
C
Chris Mason 已提交
4808
	btrfs_free_path(path);
C
Chris Mason 已提交
4809
	return ret;
4810 4811
}

C
Chris Mason 已提交
4812
/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		/* record the move in the tree mod log for internal nodes */
		if (level) {
			ret = tree_mod_log_insert_move(parent, slot, slot + 1,
					nritems - slot - 1);
			BUG_ON(ret < 0);
		}
		/* close the gap left by the removed pointer */
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
				GFP_NOFS);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		/* the node's first key changed, propagate it upwards */
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}

4858 4859
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.  zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	/* take a ref so the buffer survives btrfs_free_tree_block() */
	atomic_inc(&leaf->refs);
	btrfs_free_tree_block(trans, root, leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}
C
Chris Mason 已提交
4888 4889 4890 4891
/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 last_off;
	u32 dsize = 0;
	int ret = 0;
	int wret;
	int i;
	u32 nritems;

	leaf = path->nodes[0];
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	/* total data bytes occupied by the nr items being removed */
	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		int data_end = leaf_data_end(leaf);
		struct btrfs_map_token token;

		/* close the data gap left by the removed items */
		memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
			      data_end + dsize,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      last_off - data_end);

		/* adjust offsets of the items that slid over the gap */
		btrfs_init_map_token(&token, leaf);
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(&token, item);
			btrfs_set_token_item_offset(&token, item, ioff + dsize);
		}

		/* close the gap in the item header array */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_set_path_blocking(path);
			btrfs_clean_tree_block(leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			/* first key of the leaf changed, fix up the parents */
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			atomic_inc(&leaf->refs);

			btrfs_set_path_blocking(path);
			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				/* pushing emptied the leaf, remove it */
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}

5002
/*
5003
 * search the tree again to find a leaf with lesser keys
5004 5005
 * returns 0 if it found something or 1 if there are no lesser leaves.
 * returns < 0 on io errors.
C
Chris Mason 已提交
5006 5007 5008
 *
 * This may release the path, and so you may lose any locks held at the
 * time you call it.
5009
 */
5010
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5011
{
5012 5013 5014
	struct btrfs_key key;
	struct btrfs_disk_key found_key;
	int ret;
5015

5016
	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5017

5018
	if (key.offset > 0) {
5019
		key.offset--;
5020
	} else if (key.type > 0) {
5021
		key.type--;
5022 5023
		key.offset = (u64)-1;
	} else if (key.objectid > 0) {
5024
		key.objectid--;
5025 5026 5027
		key.type = (u8)-1;
		key.offset = (u64)-1;
	} else {
5028
		return 1;
5029
	}
5030

5031
	btrfs_release_path(path);
5032 5033 5034 5035 5036
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = comp_keys(&found_key, &key);
5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047
	/*
	 * We might have had an item with the previous key in the tree right
	 * before we released our path. And after we released our path, that
	 * item might have been pushed to the first slot (0) of the leaf we
	 * were holding due to a tree balance. Alternatively, an item with the
	 * previous key can exist as the only element of a leaf (big fat item).
	 * Therefore account for these 2 cases, so that our callers (like
	 * btrfs_previous_item) don't miss an existing item with a key matching
	 * the previous key we computed above.
	 */
	if (ret <= 0)
5048 5049
		return 0;
	return 1;
5050 5051
}

5052 5053
/*
 * A helper function to walk down the tree starting at min_key, and looking
5054 5055
 * for nodes or leaves that are have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging
5056 5057 5058 5059 5060 5061 5062 5063
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
C
Chris Mason 已提交
5064 5065 5066 5067
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
5068 5069 5070 5071
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5072
			 struct btrfs_path *path,
5073 5074 5075 5076 5077
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
5078
	int sret;
5079 5080 5081
	u32 nritems;
	int level;
	int ret = 1;
5082
	int keep_locks = path->keep_locks;
5083

5084
	path->keep_locks = 1;
5085
again:
5086
	cur = btrfs_read_lock_root_node(root);
5087
	level = btrfs_header_level(cur);
5088
	WARN_ON(path->nodes[level]);
5089
	path->nodes[level] = cur;
5090
	path->locks[level] = BTRFS_READ_LOCK;
5091 5092 5093 5094 5095

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
C
Chris Mason 已提交
5096
	while (1) {
5097 5098
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
5099
		sret = btrfs_bin_search(cur, min_key, &slot);
5100 5101 5102 5103
		if (sret < 0) {
			ret = sret;
			goto out;
		}
5104

5105 5106
		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
5107 5108
			if (slot >= nritems)
				goto find_next_key;
5109 5110 5111 5112 5113
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
5114 5115
		if (sret && slot > 0)
			slot--;
5116
		/*
5117 5118
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, old, skip to the next one.
5119
		 */
C
Chris Mason 已提交
5120
		while (slot < nritems) {
5121
			u64 gen;
5122

5123 5124 5125 5126 5127
			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
5128
			break;
5129
		}
5130
find_next_key:
5131 5132 5133 5134 5135
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
5136
			path->slots[level] = slot;
5137
			btrfs_set_path_blocking(path);
5138
			sret = btrfs_find_next_key(root, path, min_key, level,
5139
						  min_trans);
5140
			if (sret == 0) {
5141
				btrfs_release_path(path);
5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
5154
		btrfs_set_path_blocking(path);
5155
		cur = btrfs_read_node_slot(cur, slot);
5156 5157 5158 5159
		if (IS_ERR(cur)) {
			ret = PTR_ERR(cur);
			goto out;
		}
5160

5161
		btrfs_tree_read_lock(cur);
5162

5163
		path->locks[level - 1] = BTRFS_READ_LOCK;
5164
		path->nodes[level - 1] = cur;
5165
		unlock_up(path, level, 1, 0, NULL);
5166 5167
	}
out:
5168 5169 5170 5171
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		btrfs_set_path_blocking(path);
5172
		memcpy(min_key, &found_key, sizeof(found_key));
5173
	}
5174 5175 5176 5177 5178 5179
	return ret;
}

/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
5180
 * tree based on the current path and the min_trans parameters.
5181 5182 5183 5184 5185 5186 5187
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
5188
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5189
			struct btrfs_key *key, int level, u64 min_trans)
5190 5191 5192 5193
{
	int slot;
	struct extent_buffer *c;

5194
	WARN_ON(!path->keep_locks && !path->skip_locking);
C
Chris Mason 已提交
5195
	while (level < BTRFS_MAX_LEVEL) {
5196 5197 5198 5199 5200
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
5201
next:
5202
		if (slot >= btrfs_header_nritems(c)) {
5203 5204 5205 5206 5207
			int ret;
			int orig_lowest;
			struct btrfs_key cur_key;
			if (level + 1 >= BTRFS_MAX_LEVEL ||
			    !path->nodes[level + 1])
5208
				return 1;
5209

5210
			if (path->locks[level + 1] || path->skip_locking) {
5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221
				level++;
				continue;
			}

			slot = btrfs_header_nritems(c) - 1;
			if (level == 0)
				btrfs_item_key_to_cpu(c, &cur_key, slot);
			else
				btrfs_node_key_to_cpu(c, &cur_key, slot);

			orig_lowest = path->lowest_level;
5222
			btrfs_release_path(path);
5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234
			path->lowest_level = level;
			ret = btrfs_search_slot(NULL, root, &cur_key, path,
						0, 0);
			path->lowest_level = orig_lowest;
			if (ret < 0)
				return ret;

			c = path->nodes[level];
			slot = path->slots[level];
			if (ret == 0)
				slot++;
			goto next;
5235
		}
5236

5237 5238
		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
5239 5240 5241 5242 5243 5244 5245
		else {
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (gen < min_trans) {
				slot++;
				goto next;
			}
5246
			btrfs_node_key_to_cpu(c, key, slot);
5247
		}
5248 5249 5250 5251 5252
		return 0;
	}
	return 1;
}

C
Chris Mason 已提交
5253
/*
 * search the tree again to find a leaf with greater keys
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
 */
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
	/* time_seq == 0 means "current tree", no time travel */
	return btrfs_next_old_leaf(root, path, 0);
}

/*
 * Like btrfs_next_leaf(), walk to the leaf with the next greater keys, but
 * search an old version of the tree when @time_seq is non-zero (used by the
 * backref walking code).
 *
 * returns 0 if a next leaf was found, 1 if there are no greater leaves and
 * < 0 on io errors.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/* remember the last key of the current leaf before dropping locks */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leafs
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later(esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check,
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	/* walk up until we find a node with a slot to the right */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/* drop the buffer kept from a previous iteration */
		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	/* walk back down the left-most edge of the next subtree */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
5431

5432 5433 5434 5435 5436 5437
/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
5438 5439 5440 5441 5442 5443
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
5444
	u32 nritems;
5445 5446
	int ret;

C
Chris Mason 已提交
5447
	while (1) {
5448
		if (path->slots[0] == 0) {
5449
			btrfs_set_path_blocking(path);
5450 5451 5452 5453 5454 5455 5456
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
5457 5458 5459 5460 5461 5462
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

5463
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5464 5465
		if (found_key.objectid < min_objectid)
			break;
5466 5467
		if (found_key.type == type)
			return 0;
5468 5469 5470
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
5471 5472 5473
	}
	return 1;
}
5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516

/*
 * search in extent tree to find a previous Metadata/Data extent item with
 * min objectid.
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
int btrfs_previous_extent_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid)
{
	struct extent_buffer *leaf;
	struct btrfs_key found_key;
	int ret;
	u32 nritems;

	while (1) {
		/* step back one slot, hopping to the previous leaf if needed */
		if (path->slots[0] != 0) {
			path->slots[0]--;
		} else {
			btrfs_set_path_blocking(path);
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		}
		leaf = path->nodes[0];
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		/* the slot may point one past the end after a leaf change */
		if (path->slots[0] == nritems)
			path->slots[0]--;

		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
		    found_key.type == BTRFS_METADATA_ITEM_KEY)
			return 0;
		if (found_key.objectid == min_objectid &&
		    found_key.type < BTRFS_EXTENT_ITEM_KEY)
			break;
	}
	return 1;
}