ctree.c 99.9 KB
Newer Older
C
Chris Mason 已提交
1
/*
C
Chris Mason 已提交
2
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
C
Chris Mason 已提交
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

19
#include <linux/sched.h>
20 21
#include "ctree.h"
#include "disk-io.h"
22
#include "transaction.h"
23
#include "print-tree.h"
24
#include "locking.h"
25

26 27 28
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29
		      *root, struct btrfs_key *ins_key,
30
		      struct btrfs_path *path, int data_size, int extend);
31 32
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
33
			  struct extent_buffer *src, int empty);
34 35 36 37
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
38 39
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		   struct btrfs_path *path, int level, int slot);
40

C
Chris Mason 已提交
41
inline void btrfs_init_path(struct btrfs_path *p)
C
Chris Mason 已提交
42
{
C
Chris Mason 已提交
43
	memset(p, 0, sizeof(*p));
C
Chris Mason 已提交
44 45
}

C
Chris Mason 已提交
46
struct btrfs_path *btrfs_alloc_path(void)
C
Chris Mason 已提交
47
{
C
Chris Mason 已提交
48 49
	struct btrfs_path *path;
	path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
50
	if (path) {
C
Chris Mason 已提交
51
		btrfs_init_path(path);
52 53
		path->reada = 1;
	}
C
Chris Mason 已提交
54
	return path;
C
Chris Mason 已提交
55 56
}

C
Chris Mason 已提交
57
/* this also releases the path */
C
Chris Mason 已提交
58
void btrfs_free_path(struct btrfs_path *p)
59
{
C
Chris Mason 已提交
60 61
	btrfs_release_path(NULL, p);
	kmem_cache_free(btrfs_path_cachep, p);
62 63
}

C
Chris Mason 已提交
64 65 66 67 68 69
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that no locks or extent buffers held.
 */
70
void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
71 72
{
	int i;
73

C
Chris Mason 已提交
74
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
75
		p->slots[i] = 0;
76
		if (!p->nodes[i])
77 78 79 80 81
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock(p->nodes[i]);
			p->locks[i] = 0;
		}
82
		free_extent_buffer(p->nodes[i]);
83
		p->nodes[i] = NULL;
84 85 86
	}
}

C
Chris Mason 已提交
87 88 89 90 91 92 93 94 95 96
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
97 98 99 100 101 102 103 104 105 106
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;
	spin_lock(&root->node_lock);
	eb = root->node;
	extent_buffer_get(eb);
	spin_unlock(&root->node_lock);
	return eb;
}

C
Chris Mason 已提交
107 108 109 110
/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while(1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);

		spin_lock(&root->node_lock);
		if (eb == root->node) {
			spin_unlock(&root->node_lock);
			break;
		}
		spin_unlock(&root->node_lock);

		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

C
Chris Mason 已提交
132 133 134 135
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
136 137 138 139 140 141 142 143
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	if (root->track_dirty && list_empty(&root->dirty_list)) {
		list_add(&root->dirty_list,
			 &root->fs_info->dirty_cowonly_roots);
	}
}

C
Chris Mason 已提交
144 145 146 147 148
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
149 150 151 152 153 154 155 156 157
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	u32 nritems;
	int ret = 0;
	int level;
158
	struct btrfs_root *new_root;
159

160 161 162 163 164 165
	new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
	if (!new_root)
		return -ENOMEM;

	memcpy(new_root, root, sizeof(*new_root));
	new_root->root_key.objectid = new_root_objectid;
166 167 168 169 170 171 172

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	nritems = btrfs_header_nritems(buf);
Z
Zheng Yan 已提交
173 174 175 176

	cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
				     new_root_objectid, trans->transid,
				     level, buf->start, 0);
177 178
	if (IS_ERR(cow)) {
		kfree(new_root);
179
		return PTR_ERR(cow);
180
	}
181 182 183 184 185

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_owner(cow, new_root_objectid);
186
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
187 188

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
Z
Zheng Yan 已提交
189
	ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
190 191
	kfree(new_root);

192 193 194 195 196 197 198 199
	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

C
Chris Mason 已提交
200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
/*
 * does the dirty work in cow of a single block.  The parent block
 * (if supplied) is updated to point to the new cow copy.  The new
 * buffer is marked dirty and returned locked.  If you modify the block
 * it needs to be marked dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in bytes
 * the allocator should try to find free next to the block it returns.  This is
 * just a hint and may be ignored by the allocator.
 *
 * prealloc_dest -- if you have already reserved a destination for the cow,
 * this uses that block instead of allocating a new one.  btrfs_alloc_reserved_extent
 * is used to finish the allocation.
 */
216
int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
217 218 219 220
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
221 222
			     u64 search_start, u64 empty_size,
			     u64 prealloc_dest)
C
Chris Mason 已提交
223
{
Z
Zheng Yan 已提交
224
	u64 parent_start;
225
	struct extent_buffer *cow;
226
	u32 nritems;
227
	int ret = 0;
228
	int level;
229
	int unlock_orig = 0;
230

231 232 233 234 235
	if (*cow_ret == buf)
		unlock_orig = 1;

	WARN_ON(!btrfs_tree_locked(buf));

Z
Zheng Yan 已提交
236 237 238 239 240
	if (parent)
		parent_start = parent->start;
	else
		parent_start = 0;

241 242
	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
243
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
244

245 246
	level = btrfs_header_level(buf);
	nritems = btrfs_header_nritems(buf);
Z
Zheng Yan 已提交
247

248 249 250 251 252 253 254
	if (prealloc_dest) {
		struct btrfs_key ins;

		ins.objectid = prealloc_dest;
		ins.offset = buf->len;
		ins.type = BTRFS_EXTENT_ITEM_KEY;

Z
Zheng Yan 已提交
255
		ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
256
						  root->root_key.objectid,
257
						  trans->transid, level, &ins);
258 259 260 261 262
		BUG_ON(ret);
		cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
					    buf->len);
	} else {
		cow = btrfs_alloc_free_block(trans, root, buf->len,
Z
Zheng Yan 已提交
263
					     parent_start,
264
					     root->root_key.objectid,
Z
Zheng Yan 已提交
265 266
					     trans->transid, level,
					     search_start, empty_size);
267
	}
268 269
	if (IS_ERR(cow))
		return PTR_ERR(cow);
270

271
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
272
	btrfs_set_header_bytenr(cow, cow->start);
273 274
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_owner(cow, root->root_key.objectid);
275
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
276

277 278
	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (btrfs_header_generation(buf) != trans->transid) {
Z
Zheng Yan 已提交
279 280
		u32 nr_extents;
		ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
281 282
		if (ret)
			return ret;
Z
Zheng Yan 已提交
283 284 285

		ret = btrfs_cache_ref(trans, root, buf, nr_extents);
		WARN_ON(ret);
Z
Zheng Yan 已提交
286 287 288 289
	} else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
		/*
		 * There are only two places that can drop reference to
		 * tree blocks owned by living reloc trees, one is here,
Y
Yan Zheng 已提交
290
		 * the other place is btrfs_drop_subtree. In both places,
Z
Zheng Yan 已提交
291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
		 * we check reference count while tree block is locked.
		 * Furthermore, if reference count is one, it won't get
		 * increased by someone else.
		 */
		u32 refs;
		ret = btrfs_lookup_extent_ref(trans, root, buf->start,
					      buf->len, &refs);
		BUG_ON(ret);
		if (refs == 1) {
			ret = btrfs_update_ref(trans, root, buf, cow,
					       0, nritems);
			clean_tree_block(trans, root, buf);
		} else {
			ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
		}
		BUG_ON(ret);
307
	} else {
Z
Zheng Yan 已提交
308 309 310
		ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
		if (ret)
			return ret;
311 312 313
		clean_tree_block(trans, root, buf);
	}

Z
Zheng Yan 已提交
314 315 316 317 318
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
		WARN_ON(ret);
	}

C
Chris Mason 已提交
319
	if (buf == root->node) {
320 321 322
		WARN_ON(parent && parent != buf);

		spin_lock(&root->node_lock);
C
Chris Mason 已提交
323
		root->node = cow;
324
		extent_buffer_get(cow);
325 326
		spin_unlock(&root->node_lock);

C
Chris Mason 已提交
327
		if (buf != root->commit_root) {
328
			btrfs_free_extent(trans, root, buf->start,
Z
Zheng Yan 已提交
329 330 331
					  buf->len, buf->start,
					  root->root_key.objectid,
					  btrfs_header_generation(buf),
332
					  level, 1);
C
Chris Mason 已提交
333
		}
334
		free_extent_buffer(buf);
335
		add_root_to_dirty_list(root);
C
Chris Mason 已提交
336
	} else {
337
		btrfs_set_node_blockptr(parent, parent_slot,
338
					cow->start);
339 340 341
		WARN_ON(trans->transid == 0);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
C
Chris Mason 已提交
342
		btrfs_mark_buffer_dirty(parent);
343
		WARN_ON(btrfs_header_generation(parent) != trans->transid);
344
		btrfs_free_extent(trans, root, buf->start, buf->len,
Z
Zheng Yan 已提交
345
				  parent_start, btrfs_header_owner(parent),
346
				  btrfs_header_generation(parent), level, 1);
C
Chris Mason 已提交
347
	}
348 349
	if (unlock_orig)
		btrfs_tree_unlock(buf);
350
	free_extent_buffer(buf);
C
Chris Mason 已提交
351
	btrfs_mark_buffer_dirty(cow);
C
Chris Mason 已提交
352
	*cow_ret = cow;
C
Chris Mason 已提交
353 354 355
	return 0;
}

C
Chris Mason 已提交
356 357 358 359 360
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
361
int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
362 363
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
364
		    struct extent_buffer **cow_ret, u64 prealloc_dest)
365 366
{
	u64 search_start;
367
	int ret;
C
Chris Mason 已提交
368

369 370 371 372 373 374 375 376 377 378
	if (trans->transaction != root->fs_info->running_transaction) {
		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
		       root->fs_info->running_transaction->transid);
		WARN_ON(1);
	}
	if (trans->transid != root->fs_info->generation) {
		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
		       root->fs_info->generation);
		WARN_ON(1);
	}
C
Chris Mason 已提交
379

380
	spin_lock(&root->fs_info->hash_lock);
381 382
	if (btrfs_header_generation(buf) == trans->transid &&
	    btrfs_header_owner(buf) == root->root_key.objectid &&
383
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
384
		*cow_ret = buf;
385
		spin_unlock(&root->fs_info->hash_lock);
386
		WARN_ON(prealloc_dest);
387 388
		return 0;
	}
389
	spin_unlock(&root->fs_info->hash_lock);
390
	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
391
	ret = __btrfs_cow_block(trans, root, buf, parent,
392 393
				 parent_slot, cow_ret, search_start, 0,
				 prealloc_dest);
394
	return ret;
395 396
}

C
Chris Mason 已提交
397 398 399 400
/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
401
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
402
{
403
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
404
		return 1;
405
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
406 407 408 409
		return 1;
	return 0;
}

410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433
/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	if (k1.objectid > k2->objectid)
		return 1;
	if (k1.objectid < k2->objectid)
		return -1;
	if (k1.type > k2->type)
		return 1;
	if (k1.type < k2->type)
		return -1;
	if (k1.offset > k2->offset)
		return 1;
	if (k1.offset < k2->offset)
		return -1;
	return 0;
}

434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452
/*
 * same as comp_keys only with two btrfs_key's
 */
static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
453

C
Chris Mason 已提交
454 455 456 457 458
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
459
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
460
		       struct btrfs_root *root, struct extent_buffer *parent,
461 462
		       int start_slot, int cache_only, u64 *last_ret,
		       struct btrfs_key *progress)
463
{
464
	struct extent_buffer *cur;
465
	u64 blocknr;
466
	u64 gen;
467 468
	u64 search_start = *last_ret;
	u64 last_block = 0;
469 470 471 472 473
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
474
	int parent_level;
475 476
	int uptodate;
	u32 blocksize;
477 478
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;
479

480 481 482 483
	parent_level = btrfs_header_level(parent);
	if (cache_only && parent_level != 1)
		return 0;

484 485 486 487 488 489 490 491 492 493
	if (trans->transaction != root->fs_info->running_transaction) {
		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
		       root->fs_info->running_transaction->transid);
		WARN_ON(1);
	}
	if (trans->transid != root->fs_info->generation) {
		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
		       root->fs_info->generation);
		WARN_ON(1);
	}
494

495 496
	parent_nritems = btrfs_header_nritems(parent);
	blocksize = btrfs_level_size(root, parent_level - 1);
497 498 499 500 501 502 503
	end_slot = parent_nritems;

	if (parent_nritems == 1)
		return 0;

	for (i = start_slot; i < end_slot; i++) {
		int close = 1;
504

505 506 507 508 509 510 511 512
		if (!parent->map_token) {
			map_extent_buffer(parent,
					btrfs_node_key_ptr_offset(i),
					sizeof(struct btrfs_key_ptr),
					&parent->map_token, &parent->kaddr,
					&parent->map_start, &parent->map_len,
					KM_USER1);
		}
513 514 515 516 517
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
518
		blocknr = btrfs_node_blockptr(parent, i);
519
		gen = btrfs_node_ptr_generation(parent, i);
520 521
		if (last_block == 0)
			last_block = blocknr;
522

523
		if (i > 0) {
524 525
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
526
		}
C
Chris Mason 已提交
527
		if (!close && i < end_slot - 2) {
528 529
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
530
		}
531 532
		if (close) {
			last_block = blocknr;
533
			continue;
534
		}
535 536 537 538 539
		if (parent->map_token) {
			unmap_extent_buffer(parent, parent->map_token,
					    KM_USER1);
			parent->map_token = NULL;
		}
540

541 542
		cur = btrfs_find_tree_block(root, blocknr, blocksize);
		if (cur)
543
			uptodate = btrfs_buffer_uptodate(cur, gen);
544 545
		else
			uptodate = 0;
546
		if (!cur || !uptodate) {
547
			if (cache_only) {
548
				free_extent_buffer(cur);
549 550
				continue;
			}
551 552
			if (!cur) {
				cur = read_tree_block(root, blocknr,
553
							 blocksize, gen);
554
			} else if (!uptodate) {
555
				btrfs_read_buffer(cur, gen);
556
			}
557
		}
558
		if (search_start == 0)
559
			search_start = last_block;
560

561
		btrfs_tree_lock(cur);
562
		err = __btrfs_cow_block(trans, root, cur, parent, i,
563
					&cur, search_start,
564
					min(16 * blocksize,
565
					    (end_slot - i) * blocksize), 0);
Y
Yan 已提交
566
		if (err) {
567
			btrfs_tree_unlock(cur);
568
			free_extent_buffer(cur);
569
			break;
Y
Yan 已提交
570
		}
571 572
		search_start = cur->start;
		last_block = cur->start;
573
		*last_ret = search_start;
574 575
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
576
	}
577 578 579 580 581
	if (parent->map_token) {
		unmap_extent_buffer(parent, parent->map_token,
				    KM_USER1);
		parent->map_token = NULL;
	}
582 583 584
	return err;
}

C
Chris Mason 已提交
585 586 587 588 589
/*
 * The leaf data grows from end-to-front in the node.
 * this returns the address of the start of the last item,
 * which is the stop of the leaf data stack
 */
C
Chris Mason 已提交
590
static inline unsigned int leaf_data_end(struct btrfs_root *root,
591
					 struct extent_buffer *leaf)
592
{
593
	u32 nr = btrfs_header_nritems(leaf);
594
	if (nr == 0)
C
Chris Mason 已提交
595
		return BTRFS_LEAF_DATA_SIZE(root);
596
	return btrfs_item_offset_nr(leaf, nr - 1);
597 598
}

C
Chris Mason 已提交
599 600 601 602
/*
 * extra debugging checks to make sure all the items in a key are
 * well formed and in the proper order
 */
C
Chris Mason 已提交
603 604
static int check_node(struct btrfs_root *root, struct btrfs_path *path,
		      int level)
C
Chris Mason 已提交
605
{
606 607 608 609
	struct extent_buffer *parent = NULL;
	struct extent_buffer *node = path->nodes[level];
	struct btrfs_disk_key parent_key;
	struct btrfs_disk_key node_key;
C
Chris Mason 已提交
610
	int parent_slot;
611 612
	int slot;
	struct btrfs_key cpukey;
613
	u32 nritems = btrfs_header_nritems(node);
C
Chris Mason 已提交
614 615

	if (path->nodes[level + 1])
616
		parent = path->nodes[level + 1];
A
Aneesh 已提交
617

618
	slot = path->slots[level];
619 620
	BUG_ON(nritems == 0);
	if (parent) {
A
Aneesh 已提交
621
		parent_slot = path->slots[level + 1];
622 623 624
		btrfs_node_key(parent, &parent_key, parent_slot);
		btrfs_node_key(node, &node_key, 0);
		BUG_ON(memcmp(&parent_key, &node_key,
C
Chris Mason 已提交
625
			      sizeof(struct btrfs_disk_key)));
626
		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
627
		       btrfs_header_bytenr(node));
C
Chris Mason 已提交
628
	}
C
Chris Mason 已提交
629
	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
630
	if (slot != 0) {
631 632 633
		btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
		btrfs_node_key(node, &node_key, slot);
		BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
634 635
	}
	if (slot < nritems - 1) {
636 637 638
		btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
		btrfs_node_key(node, &node_key, slot);
		BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
C
Chris Mason 已提交
639 640 641 642
	}
	return 0;
}

C
Chris Mason 已提交
643 644 645 646
/*
 * extra checking to make sure all the items in a leaf are
 * well formed and in the proper order
 */
C
Chris Mason 已提交
647 648
static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
		      int level)
C
Chris Mason 已提交
649
{
650 651
	struct extent_buffer *leaf = path->nodes[level];
	struct extent_buffer *parent = NULL;
C
Chris Mason 已提交
652
	int parent_slot;
653
	struct btrfs_key cpukey;
654 655 656
	struct btrfs_disk_key parent_key;
	struct btrfs_disk_key leaf_key;
	int slot = path->slots[0];
657

658
	u32 nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
659 660

	if (path->nodes[level + 1])
661
		parent = path->nodes[level + 1];
662 663 664 665 666

	if (nritems == 0)
		return 0;

	if (parent) {
A
Aneesh 已提交
667
		parent_slot = path->slots[level + 1];
668 669
		btrfs_node_key(parent, &parent_key, parent_slot);
		btrfs_item_key(leaf, &leaf_key, 0);
670

671
		BUG_ON(memcmp(&parent_key, &leaf_key,
C
Chris Mason 已提交
672
		       sizeof(struct btrfs_disk_key)));
673
		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
674
		       btrfs_header_bytenr(leaf));
675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699
	}
#if 0
	for (i = 0; nritems > 1 && i < nritems - 2; i++) {
		btrfs_item_key_to_cpu(leaf, &cpukey, i + 1);
		btrfs_item_key(leaf, &leaf_key, i);
		if (comp_keys(&leaf_key, &cpukey) >= 0) {
			btrfs_print_leaf(root, leaf);
			printk("slot %d offset bad key\n", i);
			BUG_ON(1);
		}
		if (btrfs_item_offset_nr(leaf, i) !=
			btrfs_item_end_nr(leaf, i + 1)) {
			btrfs_print_leaf(root, leaf);
			printk("slot %d offset bad\n", i);
			BUG_ON(1);
		}
		if (i == 0) {
			if (btrfs_item_offset_nr(leaf, i) +
			       btrfs_item_size_nr(leaf, i) !=
			       BTRFS_LEAF_DATA_SIZE(root)) {
				btrfs_print_leaf(root, leaf);
				printk("slot %d first offset bad\n", i);
				BUG_ON(1);
			}
		}
C
Chris Mason 已提交
700
	}
701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722
	if (nritems > 0) {
		if (btrfs_item_size_nr(leaf, nritems - 1) > 4096) {
				btrfs_print_leaf(root, leaf);
				printk("slot %d bad size \n", nritems - 1);
				BUG_ON(1);
		}
	}
#endif
	if (slot != 0 && slot < nritems - 1) {
		btrfs_item_key(leaf, &leaf_key, slot);
		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
		if (comp_keys(&leaf_key, &cpukey) <= 0) {
			btrfs_print_leaf(root, leaf);
			printk("slot %d offset bad key\n", slot);
			BUG_ON(1);
		}
		if (btrfs_item_offset_nr(leaf, slot - 1) !=
		       btrfs_item_end_nr(leaf, slot)) {
			btrfs_print_leaf(root, leaf);
			printk("slot %d offset bad\n", slot);
			BUG_ON(1);
		}
723 724
	}
	if (slot < nritems - 1) {
725 726 727 728 729 730 731 732 733
		btrfs_item_key(leaf, &leaf_key, slot);
		btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
		BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
		if (btrfs_item_offset_nr(leaf, slot) !=
			btrfs_item_end_nr(leaf, slot + 1)) {
			btrfs_print_leaf(root, leaf);
			printk("slot %d offset bad\n", slot);
			BUG_ON(1);
		}
C
Chris Mason 已提交
734
	}
735 736
	BUG_ON(btrfs_item_offset_nr(leaf, 0) +
	       btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
C
Chris Mason 已提交
737 738 739
	return 0;
}

740 741
static int noinline check_block(struct btrfs_root *root,
				struct btrfs_path *path, int level)
C
Chris Mason 已提交
742
{
743
	u64 found_start;
744
	return 0;
745 746 747 748 749 750 751 752 753
	if (btrfs_header_level(path->nodes[level]) != level)
	    printk("warning: bad level %Lu wanted %d found %d\n",
		   path->nodes[level]->start, level,
		   btrfs_header_level(path->nodes[level]));
	found_start = btrfs_header_bytenr(path->nodes[level]);
	if (found_start != path->nodes[level]->start) {
	    printk("warning: bad bytentr %Lu found %Lu\n",
		   path->nodes[level]->start, found_start);
	}
754
#if 0
755 756
	struct extent_buffer *buf = path->nodes[level];

757 758 759
	if (memcmp_extent_buffer(buf, root->fs_info->fsid,
				 (unsigned long)btrfs_header_fsid(buf),
				 BTRFS_FSID_SIZE)) {
760
		printk("warning bad block %Lu\n", buf->start);
761
		return 1;
762
	}
763
#endif
C
Chris Mason 已提交
764
	if (level == 0)
C
Chris Mason 已提交
765 766
		return check_leaf(root, path, level);
	return check_node(root, path, level);
C
Chris Mason 已提交
767 768
}

C
Chris Mason 已提交
769
/*
770 771 772
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
C
Chris Mason 已提交
773 774 775 776 777 778
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
779 780 781 782
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
783 784 785 786 787
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
788
	struct btrfs_disk_key *tmp = NULL;
789 790 791 792 793 794
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *map_token = NULL;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
795
	int err;
796 797 798

	while(low < high) {
		mid = (low + high) / 2;
799 800 801 802 803
		offset = p + mid * item_size;

		if (!map_token || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {
804
			if (map_token) {
805
				unmap_extent_buffer(eb, map_token, KM_USER0);
806 807 808 809 810 811 812 813 814 815 816 817 818 819 820
				map_token = NULL;
			}
			err = map_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&map_token, &kaddr,
						&map_start, &map_len, KM_USER0);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}
821 822 823 824 825

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
826 827 828 829 830 831 832 833
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
834 835
			if (map_token)
				unmap_extent_buffer(eb, map_token, KM_USER0);
836 837 838 839
			return 0;
		}
	}
	*slot = low;
840 841
	if (map_token)
		unmap_extent_buffer(eb, map_token, KM_USER0);
842 843 844
	return 1;
}

C
Chris Mason 已提交
845 846 847 848
/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
849 850
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		      int level, int *slot)
851
{
852 853 854
	if (level == 0) {
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
C
Chris Mason 已提交
855
					  sizeof(struct btrfs_item),
856
					  key, btrfs_header_nritems(eb),
857
					  slot);
858
	} else {
859 860
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
C
Chris Mason 已提交
861
					  sizeof(struct btrfs_key_ptr),
862
					  key, btrfs_header_nritems(eb),
863
					  slot);
864 865 866 867
	}
	return -1;
}

C
Chris Mason 已提交
868 869 870 871
/* given a node and slot number, this reads the blocks it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 * NULL is returned on error.
 */
872
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
873
				   struct extent_buffer *parent, int slot)
874
{
875
	int level = btrfs_header_level(parent);
876 877
	if (slot < 0)
		return NULL;
878
	if (slot >= btrfs_header_nritems(parent))
879
		return NULL;
880 881 882

	BUG_ON(level == 0);

883
	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
884 885
		       btrfs_level_size(root, level - 1),
		       btrfs_node_ptr_generation(parent, slot));
886 887
}

C
Chris Mason 已提交
888 889 890 891 892
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 */
893
static noinline int balance_level(struct btrfs_trans_handle *trans,
894 895
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
896
{
897 898 899 900
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
901 902 903 904
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
905
	int err_on_enospc = 0;
906
	u64 orig_ptr;
907 908 909 910

	if (level == 0)
		return 0;

911
	mid = path->nodes[level];
912
	WARN_ON(!path->locks[level]);
913 914
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

915
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
916

C
Chris Mason 已提交
917
	if (level < BTRFS_MAX_LEVEL - 1)
918
		parent = path->nodes[level + 1];
919 920
	pslot = path->slots[level + 1];

C
Chris Mason 已提交
921 922 923 924
	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
925 926
	if (!parent) {
		struct extent_buffer *child;
927

928
		if (btrfs_header_nritems(mid) != 1)
929 930 931
			return 0;

		/* promote the child to a root */
932
		child = read_node_slot(root, mid, 0);
933
		btrfs_tree_lock(child);
934
		BUG_ON(!child);
935
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
936 937
		BUG_ON(ret);

938
		spin_lock(&root->node_lock);
939
		root->node = child;
940 941
		spin_unlock(&root->node_lock);

Z
Zheng Yan 已提交
942 943 944
		ret = btrfs_update_extent_ref(trans, root, child->start,
					      mid->start, child->start,
					      root->root_key.objectid,
945
					      trans->transid, level - 1);
Z
Zheng Yan 已提交
946 947
		BUG_ON(ret);

948
		add_root_to_dirty_list(root);
949 950
		btrfs_tree_unlock(child);
		path->locks[level] = 0;
951
		path->nodes[level] = NULL;
952
		clean_tree_block(trans, root, mid);
953
		btrfs_tree_unlock(mid);
954
		/* once for the path */
955
		free_extent_buffer(mid);
956
		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
Z
Zheng Yan 已提交
957
					mid->start, root->root_key.objectid,
958 959
					btrfs_header_generation(mid),
					level, 1);
960
		/* once for the root ptr */
961
		free_extent_buffer(mid);
962
		return ret;
963
	}
964
	if (btrfs_header_nritems(mid) >
C
Chris Mason 已提交
965
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
966 967
		return 0;

968
	if (btrfs_header_nritems(mid) < 2)
969 970
		err_on_enospc = 1;

971 972
	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
973
		btrfs_tree_lock(left);
974
		wret = btrfs_cow_block(trans, root, left,
975
				       parent, pslot - 1, &left, 0);
976 977 978 979
		if (wret) {
			ret = wret;
			goto enospc;
		}
980
	}
981 982
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
983
		btrfs_tree_lock(right);
984
		wret = btrfs_cow_block(trans, root, right,
985
				       parent, pslot + 1, &right, 0);
986 987 988 989 990 991 992
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
993 994
	if (left) {
		orig_slot += btrfs_header_nritems(left);
995
		wret = push_node_left(trans, root, left, mid, 1);
996 997
		if (wret < 0)
			ret = wret;
998
		if (btrfs_header_nritems(mid) < 2)
999
			err_on_enospc = 1;
1000
	}
1001 1002 1003 1004

	/*
	 * then try to empty the right most buffer into the middle
	 */
1005
	if (right) {
1006
		wret = push_node_left(trans, root, mid, right, 1);
1007
		if (wret < 0 && wret != -ENOSPC)
1008
			ret = wret;
1009
		if (btrfs_header_nritems(right) == 0) {
1010
			u64 bytenr = right->start;
1011
			u64 generation = btrfs_header_generation(parent);
1012 1013
			u32 blocksize = right->len;

1014
			clean_tree_block(trans, root, right);
1015
			btrfs_tree_unlock(right);
1016
			free_extent_buffer(right);
1017
			right = NULL;
1018 1019
			wret = del_ptr(trans, root, path, level + 1, pslot +
				       1);
1020 1021
			if (wret)
				ret = wret;
1022
			wret = btrfs_free_extent(trans, root, bytenr,
Z
Zheng Yan 已提交
1023
						 blocksize, parent->start,
1024
						 btrfs_header_owner(parent),
1025
						 generation, level, 1);
1026 1027 1028
			if (wret)
				ret = wret;
		} else {
1029 1030 1031 1032
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
1033 1034
		}
	}
1035
	if (btrfs_header_nritems(mid) == 1) {
1036 1037 1038 1039 1040 1041 1042 1043 1044
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
1045 1046
		BUG_ON(!left);
		wret = balance_node_right(trans, root, mid, left);
1047
		if (wret < 0) {
1048
			ret = wret;
1049 1050
			goto enospc;
		}
1051 1052 1053 1054 1055
		if (wret == 1) {
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
1056 1057
		BUG_ON(wret == 1);
	}
1058
	if (btrfs_header_nritems(mid) == 0) {
1059
		/* we've managed to empty the middle node, drop it */
1060
		u64 root_gen = btrfs_header_generation(parent);
1061 1062
		u64 bytenr = mid->start;
		u32 blocksize = mid->len;
1063

1064
		clean_tree_block(trans, root, mid);
1065
		btrfs_tree_unlock(mid);
1066
		free_extent_buffer(mid);
1067
		mid = NULL;
1068
		wret = del_ptr(trans, root, path, level + 1, pslot);
1069 1070
		if (wret)
			ret = wret;
1071
		wret = btrfs_free_extent(trans, root, bytenr, blocksize,
Z
Zheng Yan 已提交
1072
					 parent->start,
1073
					 btrfs_header_owner(parent),
1074
					 root_gen, level, 1);
1075 1076
		if (wret)
			ret = wret;
1077 1078
	} else {
		/* update the parent key to reflect our changes */
1079 1080 1081 1082
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
1083
	}
1084

1085
	/* update the path */
1086 1087 1088
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			extent_buffer_get(left);
1089
			/* left was locked after cow */
1090
			path->nodes[level] = left;
1091 1092
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
1093 1094
			if (mid) {
				btrfs_tree_unlock(mid);
1095
				free_extent_buffer(mid);
1096
			}
1097
		} else {
1098
			orig_slot -= btrfs_header_nritems(left);
1099 1100 1101
			path->slots[level] = orig_slot;
		}
	}
1102
	/* double check we haven't messed things up */
C
Chris Mason 已提交
1103
	check_block(root, path, level);
C
Chris Mason 已提交
1104
	if (orig_ptr !=
1105
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1106
		BUG();
1107
enospc:
1108 1109
	if (right) {
		btrfs_tree_unlock(right);
1110
		free_extent_buffer(right);
1111 1112 1113 1114
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
1115
		free_extent_buffer(left);
1116
	}
1117 1118 1119
	return ret;
}

C
Chris Mason 已提交
1120 1121 1122 1123
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
1124 1125 1126
static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
1127
{
1128 1129 1130 1131
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
1132 1133 1134 1135 1136 1137 1138 1139 1140
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 1;

1141
	mid = path->nodes[level];
1142
	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1143 1144 1145
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1)
1146
		parent = path->nodes[level + 1];
1147 1148
	pslot = path->slots[level + 1];

1149
	if (!parent)
1150 1151
		return 1;

1152
	left = read_node_slot(root, parent, pslot - 1);
1153 1154

	/* first, try to make some room in the middle buffer */
1155
	if (left) {
1156
		u32 left_nr;
1157 1158

		btrfs_tree_lock(left);
1159
		left_nr = btrfs_header_nritems(left);
C
Chris Mason 已提交
1160 1161 1162
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
1163
			ret = btrfs_cow_block(trans, root, left, parent,
1164
					      pslot - 1, &left, 0);
1165 1166 1167 1168
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
1169
						      left, mid, 0);
1170
			}
C
Chris Mason 已提交
1171
		}
1172 1173 1174
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
1175
			struct btrfs_disk_key disk_key;
1176
			orig_slot += left_nr;
1177 1178 1179 1180 1181
			btrfs_node_key(mid, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
1182 1183
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
1184
				btrfs_tree_unlock(mid);
1185
				free_extent_buffer(mid);
1186 1187
			} else {
				orig_slot -=
1188
					btrfs_header_nritems(left);
1189
				path->slots[level] = orig_slot;
1190
				btrfs_tree_unlock(left);
1191
				free_extent_buffer(left);
1192 1193 1194
			}
			return 0;
		}
1195
		btrfs_tree_unlock(left);
1196
		free_extent_buffer(left);
1197
	}
1198
	right = read_node_slot(root, parent, pslot + 1);
1199 1200 1201 1202

	/*
	 * then try to empty the right most buffer into the middle
	 */
1203
	if (right) {
C
Chris Mason 已提交
1204
		u32 right_nr;
1205
		btrfs_tree_lock(right);
1206
		right_nr = btrfs_header_nritems(right);
C
Chris Mason 已提交
1207 1208 1209
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
1210 1211
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
1212
					      &right, 0);
1213 1214 1215 1216
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
1217
							  right, mid);
1218
			}
C
Chris Mason 已提交
1219
		}
1220 1221 1222
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
1223 1224 1225 1226 1227 1228 1229 1230
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
1231 1232
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
1233
					btrfs_header_nritems(mid);
1234
				btrfs_tree_unlock(mid);
1235
				free_extent_buffer(mid);
1236
			} else {
1237
				btrfs_tree_unlock(right);
1238
				free_extent_buffer(right);
1239 1240 1241
			}
			return 0;
		}
1242
		btrfs_tree_unlock(right);
1243
		free_extent_buffer(right);
1244 1245 1246 1247
	}
	return 1;
}

1248
/*
C
Chris Mason 已提交
1249 1250
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
1251
 */
1252 1253 1254
static noinline void reada_for_search(struct btrfs_root *root,
				      struct btrfs_path *path,
				      int level, int slot, u64 objectid)
1255
{
1256
	struct extent_buffer *node;
1257
	struct btrfs_disk_key disk_key;
1258 1259
	u32 nritems;
	u64 search;
1260 1261 1262
	u64 lowest_read;
	u64 highest_read;
	u64 nread = 0;
1263
	int direction = path->reada;
1264
	struct extent_buffer *eb;
1265 1266 1267
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;
1268

1269
	if (level != 1)
1270 1271 1272
		return;

	if (!path->nodes[level])
1273 1274
		return;

1275
	node = path->nodes[level];
1276

1277
	search = btrfs_node_blockptr(node, slot);
1278 1279
	blocksize = btrfs_level_size(root, level - 1);
	eb = btrfs_find_tree_block(root, search, blocksize);
1280 1281
	if (eb) {
		free_extent_buffer(eb);
1282 1283 1284
		return;
	}

1285 1286 1287
	highest_read = search;
	lowest_read = search;

1288
	nritems = btrfs_header_nritems(node);
1289
	nr = slot;
1290
	while(1) {
1291 1292 1293 1294 1295 1296 1297 1298
		if (direction < 0) {
			if (nr == 0)
				break;
			nr--;
		} else if (direction > 0) {
			nr++;
			if (nr >= nritems)
				break;
1299
		}
1300 1301 1302 1303 1304
		if (path->reada < 0 && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
1305 1306
		search = btrfs_node_blockptr(node, nr);
		if ((search >= lowest_read && search <= highest_read) ||
1307 1308
		    (search < lowest_read && lowest_read - search <= 16384) ||
		    (search > highest_read && search - highest_read <= 16384)) {
1309 1310
			readahead_tree_block(root, search, blocksize,
				     btrfs_node_ptr_generation(node, nr));
1311 1312 1313
			nread += blocksize;
		}
		nscan++;
1314
		if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
1315
			break;
1316
		if(nread > (256 * 1024) || nscan > 128)
1317 1318 1319 1320 1321 1322
			break;

		if (search < lowest_read)
			lowest_read = search;
		if (search > highest_read)
			highest_read = search;
1323 1324
	}
}
1325

C
Chris Mason 已提交
1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers in
 * the tree.  The exceptions are when our path goes through slot 0, because operations
 * on the tree might require changing key pointers higher up in the tree.
 *
 * callers might also have set path->keep_locks, which tells this code to
 * keep the lock if the path points to the last slot in the block.  This is
 * part of walking through the tree, and selecting the next slot in the higher
 * block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.
 * so if lowest_unlock is 1, level 0 won't be unlocked
 */
1339 1340
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock)
1341 1342 1343
{
	int i;
	int skip_level = level;
1344
	int no_skips = 0;
1345 1346 1347 1348 1349 1350 1351
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
1352
		if (!no_skips && path->slots[i] == 0) {
1353 1354 1355
			skip_level = i + 1;
			continue;
		}
1356
		if (!no_skips && path->keep_locks) {
1357 1358 1359
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
1360
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
1361 1362 1363 1364
				skip_level = i + 1;
				continue;
			}
		}
1365 1366 1367
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

1368 1369 1370 1371 1372 1373 1374 1375
		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock(t);
			path->locks[i] = 0;
		}
	}
}

C
Chris Mason 已提交
1376 1377 1378 1379 1380 1381
/*
 * look for key in the tree.  path is filled in with nodes along the way
 * if key is found, we return zero and you can find the item in the leaf
 * level of the path (level 0)
 *
 * If the key isn't found, the path points to the slot where it should
C
Chris Mason 已提交
1382 1383
 * be inserted, and 1 is returned.  If there are other errors during the
 * search a negative error number is returned.
C
Chris Mason 已提交
1384 1385 1386 1387
 *
 * if ins_len > 0, nodes and leaves will be split as we walk down the
 * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
 * possible)
C
Chris Mason 已提交
1388
 */
1389 1390 1391
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
1392
{
1393
	struct extent_buffer *b;
1394
	struct extent_buffer *tmp;
1395 1396 1397
	int slot;
	int ret;
	int level;
1398
	int should_reada = p->reada;
1399
	int lowest_unlock = 1;
1400
	int blocksize;
1401
	u8 lowest_level = 0;
1402 1403
	u64 blocknr;
	u64 gen;
1404
	struct btrfs_key prealloc_block;
1405

1406
	lowest_level = p->lowest_level;
1407
	WARN_ON(lowest_level && ins_len > 0);
C
Chris Mason 已提交
1408
	WARN_ON(p->nodes[0] != NULL);
1409

1410 1411
	if (ins_len < 0)
		lowest_unlock = 2;
1412 1413 1414

	prealloc_block.objectid = 0;

1415
again:
1416 1417 1418 1419
	if (p->skip_locking)
		b = btrfs_root_node(root);
	else
		b = btrfs_lock_root_node(root);
1420

1421
	while (b) {
1422
		level = btrfs_header_level(b);
1423 1424 1425 1426 1427 1428 1429 1430 1431

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		p->nodes[level] = b;
		if (!p->skip_locking)
			p->locks[level] = 1;

C
Chris Mason 已提交
1432 1433
		if (cow) {
			int wret;
1434 1435 1436 1437

			/* is a cow on this block not required */
			spin_lock(&root->fs_info->hash_lock);
			if (btrfs_header_generation(b) == trans->transid &&
1438
			    btrfs_header_owner(b) == root->root_key.objectid &&
1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473
			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
				spin_unlock(&root->fs_info->hash_lock);
				goto cow_done;
			}
			spin_unlock(&root->fs_info->hash_lock);

			/* ok, we have to cow, is our old prealloc the right
			 * size?
			 */
			if (prealloc_block.objectid &&
			    prealloc_block.offset != b->len) {
				btrfs_free_reserved_extent(root,
					   prealloc_block.objectid,
					   prealloc_block.offset);
				prealloc_block.objectid = 0;
			}

			/*
			 * for higher level blocks, try not to allocate blocks
			 * with the block and the parent locks held.
			 */
			if (level > 1 && !prealloc_block.objectid &&
			    btrfs_path_lock_waiting(p, level)) {
				u32 size = b->len;
				u64 hint = b->start;

				btrfs_release_path(root, p);
				ret = btrfs_reserve_extent(trans, root,
							   size, size, 0,
							   hint, (u64)-1,
							   &prealloc_block, 0);
				BUG_ON(ret);
				goto again;
			}

C
Chris Mason 已提交
1474 1475 1476
			wret = btrfs_cow_block(trans, root, b,
					       p->nodes[level + 1],
					       p->slots[level + 1],
1477 1478
					       &b, prealloc_block.objectid);
			prealloc_block.objectid = 0;
1479
			if (wret) {
1480
				free_extent_buffer(b);
1481 1482
				ret = wret;
				goto done;
1483
			}
C
Chris Mason 已提交
1484
		}
1485
cow_done:
C
Chris Mason 已提交
1486
		BUG_ON(!cow && ins_len);
1487
		if (level != btrfs_header_level(b))
C
Chris Mason 已提交
1488
			WARN_ON(1);
1489
		level = btrfs_header_level(b);
1490

1491
		p->nodes[level] = b;
1492 1493
		if (!p->skip_locking)
			p->locks[level] = 1;
1494

C
Chris Mason 已提交
1495
		ret = check_block(root, p, level);
1496 1497 1498 1499
		if (ret) {
			ret = -1;
			goto done;
		}
1500

1501 1502
		ret = bin_search(b, key, level, &slot);
		if (level != 0) {
1503 1504 1505
			if (ret && slot > 0)
				slot -= 1;
			p->slots[level] = slot;
1506
			if (ins_len > 0 && btrfs_header_nritems(b) >=
1507
			    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1508
				int sret = split_node(trans, root, p, level);
C
Chris Mason 已提交
1509
				BUG_ON(sret > 0);
1510 1511 1512 1513
				if (sret) {
					ret = sret;
					goto done;
				}
C
Chris Mason 已提交
1514 1515
				b = p->nodes[level];
				slot = p->slots[level];
1516
			} else if (ins_len < 0) {
1517 1518
				int sret = balance_level(trans, root, p,
							 level);
1519 1520 1521 1522
				if (sret) {
					ret = sret;
					goto done;
				}
1523
				b = p->nodes[level];
1524 1525
				if (!b) {
					btrfs_release_path(NULL, p);
1526
					goto again;
1527
				}
1528
				slot = p->slots[level];
1529
				BUG_ON(btrfs_header_nritems(b) == 1);
C
Chris Mason 已提交
1530
			}
1531 1532
			unlock_up(p, level, lowest_unlock);

1533
			/* this is only true while dropping a snapshot */
1534
			if (level == lowest_level) {
1535 1536
				ret = 0;
				goto done;
1537
			}
1538

1539 1540 1541 1542 1543 1544
			blocknr = btrfs_node_blockptr(b, slot);
			gen = btrfs_node_ptr_generation(b, slot);
			blocksize = btrfs_level_size(root, level - 1);

			tmp = btrfs_find_tree_block(root, blocknr, blocksize);
			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1545 1546 1547 1548 1549 1550 1551 1552 1553
				b = tmp;
			} else {
				/*
				 * reduce lock contention at high levels
				 * of the btree by dropping locks before
				 * we read.
				 */
				if (level > 1) {
					btrfs_release_path(NULL, p);
1554 1555
					if (tmp)
						free_extent_buffer(tmp);
1556 1557 1558 1559 1560
					if (should_reada)
						reada_for_search(root, p,
								 level, slot,
								 key->objectid);

1561 1562
					tmp = read_tree_block(root, blocknr,
							 blocksize, gen);
1563 1564 1565 1566
					if (tmp)
						free_extent_buffer(tmp);
					goto again;
				} else {
1567 1568
					if (tmp)
						free_extent_buffer(tmp);
1569 1570 1571 1572
					if (should_reada)
						reada_for_search(root, p,
								 level, slot,
								 key->objectid);
1573 1574 1575
					b = read_node_slot(root, b, slot);
				}
			}
1576 1577
			if (!p->skip_locking)
				btrfs_tree_lock(b);
1578 1579
		} else {
			p->slots[level] = slot;
1580
			if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
C
Chris Mason 已提交
1581
			    sizeof(struct btrfs_item) + ins_len) {
1582
				int sret = split_leaf(trans, root, key,
1583
						      p, ins_len, ret == 0);
C
Chris Mason 已提交
1584
				BUG_ON(sret > 0);
1585 1586 1587 1588
				if (sret) {
					ret = sret;
					goto done;
				}
C
Chris Mason 已提交
1589
			}
1590
			unlock_up(p, level, lowest_unlock);
1591
			goto done;
1592 1593
		}
	}
1594 1595 1596 1597 1598 1599 1600 1601 1602
	ret = 1;
done:
	if (prealloc_block.objectid) {
		btrfs_free_reserved_extent(root,
			   prealloc_block.objectid,
			   prealloc_block.offset);
	}

	return ret;
1603 1604
}

Z
Zheng Yan 已提交
1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644
int btrfs_merge_path(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_key *node_keys,
		     u64 *nodes, int lowest_level)
{
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_key key;
	u64 bytenr;
	u64 generation;
	u32 blocksize;
	int level;
	int slot;
	int key_match;
	int ret;

	eb = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
	BUG_ON(ret);

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		if (level == 0 || level <= lowest_level)
			break;

		ret = bin_search(parent, &node_keys[lowest_level], level,
				 &slot);
		if (ret && slot > 0)
			slot--;

		bytenr = btrfs_node_blockptr(parent, slot);
		if (nodes[level - 1] == bytenr)
			break;

		blocksize = btrfs_level_size(root, level - 1);
		generation = btrfs_node_ptr_generation(parent, slot);
		btrfs_node_key_to_cpu(eb, &key, slot);
		key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));

Y
Yan Zheng 已提交
1645 1646 1647 1648 1649 1650
		if (generation == trans->transid) {
			eb = read_tree_block(root, bytenr, blocksize,
					     generation);
			btrfs_tree_lock(eb);
		}

Z
Zheng Yan 已提交
1651 1652 1653
		/*
		 * if node keys match and node pointer hasn't been modified
		 * in the running transaction, we can merge the path. for
Y
Yan Zheng 已提交
1654 1655 1656
		 * blocks owened by reloc trees, the node pointer check is
		 * skipped, this is because these blocks are fully controlled
		 * by the space balance code, no one else can modify them.
Z
Zheng Yan 已提交
1657 1658 1659
		 */
		if (!nodes[level - 1] || !key_match ||
		    (generation == trans->transid &&
Y
Yan Zheng 已提交
1660 1661 1662 1663 1664 1665
		     btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
			if (level == 1 || level == lowest_level + 1) {
				if (generation == trans->transid) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
				}
Z
Zheng Yan 已提交
1666
				break;
Y
Yan Zheng 已提交
1667
			}
Z
Zheng Yan 已提交
1668

Y
Yan Zheng 已提交
1669 1670 1671 1672 1673
			if (generation != trans->transid) {
				eb = read_tree_block(root, bytenr, blocksize,
						generation);
				btrfs_tree_lock(eb);
			}
Z
Zheng Yan 已提交
1674 1675 1676 1677 1678

			ret = btrfs_cow_block(trans, root, eb, parent, slot,
					      &eb, 0);
			BUG_ON(ret);

Y
Yan Zheng 已提交
1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				if (!nodes[level - 1]) {
					nodes[level - 1] = eb->start;
					memcpy(&node_keys[level - 1], &key,
					       sizeof(node_keys[0]));
				} else {
					WARN_ON(1);
				}
			}

Z
Zheng Yan 已提交
1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			parent = eb;
			continue;
		}

		btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
		btrfs_set_node_ptr_generation(parent, slot, trans->transid);
		btrfs_mark_buffer_dirty(parent);

		ret = btrfs_inc_extent_ref(trans, root,
					nodes[level - 1],
					blocksize, parent->start,
					btrfs_header_owner(parent),
					btrfs_header_generation(parent),
1705
					level - 1);
Z
Zheng Yan 已提交
1706 1707
		BUG_ON(ret);

Y
Yan Zheng 已提交
1708 1709 1710 1711 1712
		/*
		 * If the block was created in the running transaction,
		 * it's possible this is the last reference to it, so we
		 * should drop the subtree.
		 */
Z
Zheng Yan 已提交
1713
		if (generation == trans->transid) {
Y
Yan Zheng 已提交
1714 1715
			ret = btrfs_drop_subtree(trans, root, eb, parent);
			BUG_ON(ret);
Z
Zheng Yan 已提交
1716 1717
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
Y
Yan Zheng 已提交
1718 1719 1720 1721 1722 1723 1724
		} else {
			ret = btrfs_free_extent(trans, root, bytenr,
					blocksize, parent->start,
					btrfs_header_owner(parent),
					btrfs_header_generation(parent),
					level - 1, 1);
			BUG_ON(ret);
Z
Zheng Yan 已提交
1725 1726 1727 1728 1729 1730 1731 1732
		}
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return 0;
}

C
Chris Mason 已提交
1733 1734 1735 1736 1737 1738
/*
 * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node is points to 'key'.
 * This is used after shifting pointers to the left, so it stops
 * fixing up pointers when a given leaf/node is not in slot 0 of the
 * higher levels
C
Chris Mason 已提交
1739 1740 1741
 *
 * If this fails to write a tree block, it returns -1, but continues
 * fixing up the blocks in ram so the tree is consistent.
C
Chris Mason 已提交
1742
 */
1743 1744 1745
static int fixup_low_keys(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct btrfs_path *path,
			  struct btrfs_disk_key *key, int level)
1746 1747
{
	int i;
C
Chris Mason 已提交
1748
	int ret = 0;
1749 1750
	struct extent_buffer *t;

C
Chris Mason 已提交
1751
	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1752
		int tslot = path->slots[i];
1753
		if (!path->nodes[i])
1754
			break;
1755 1756
		t = path->nodes[i];
		btrfs_set_node_key(t, key, tslot);
C
Chris Mason 已提交
1757
		btrfs_mark_buffer_dirty(path->nodes[i]);
1758 1759 1760
		if (tslot != 0)
			break;
	}
C
Chris Mason 已提交
1761
	return ret;
1762 1763
}

Z
Zheng Yan 已提交
1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798
/*
 * update item key.
 *
 * This function isn't completely safe. It's the caller's responsibility
 * that the new key won't break the order
 */
int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, struct btrfs_path *path,
			    struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
		if (comp_keys(&disk_key, new_key) >= 0)
			return -1;
	}
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
		if (comp_keys(&disk_key, new_key) <= 0)
			return -1;
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	if (slot == 0)
		fixup_low_keys(trans, root, path, &disk_key, 1);
	return 0;
}

C
Chris Mason 已提交
1799 1800
/*
 * try to push data from one node into the next node left in the
1801
 * tree.
C
Chris Mason 已提交
1802 1803 1804
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
C
Chris Mason 已提交
1805
 */
1806 1807
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
1808
			  struct extent_buffer *src, int empty)
1809 1810
{
	int push_items = 0;
1811 1812
	int src_nritems;
	int dst_nritems;
C
Chris Mason 已提交
1813
	int ret = 0;
1814

1815 1816
	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
C
Chris Mason 已提交
1817
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1818 1819
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);
1820

1821
	if (!empty && src_nritems <= 8)
1822 1823
		return 1;

1824
	if (push_items <= 0) {
1825
		return 1;
1826
	}
1827

1828
	if (empty) {
1829
		push_items = min(src_nritems, push_items);
1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);
1842

1843 1844 1845 1846 1847
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
		           push_items * sizeof(struct btrfs_key_ptr));

1848
	if (push_items < src_nritems) {
1849 1850 1851 1852 1853 1854 1855 1856 1857
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);
Z
Zheng Yan 已提交
1858 1859 1860 1861

	ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
	BUG_ON(ret);

1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873
	return ret;
}

/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
1874 1875 1876 1877
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
1878 1879 1880 1881 1882 1883 1884
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

1885 1886 1887
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

1888 1889
	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
C
Chris Mason 已提交
1890
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1891
	if (push_items <= 0) {
1892
		return 1;
1893 1894 1895 1896 1897
	}

	if (src_nritems < 4) {
		return 1;
	}
1898 1899 1900

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
1901
	if (max_push >= src_nritems) {
1902
		return 1;
1903
	}
Y
Yan 已提交
1904

1905 1906 1907
	if (max_push < push_items)
		push_items = max_push;

1908 1909 1910 1911
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));
C
Chris Mason 已提交
1912

1913 1914 1915 1916
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
		           push_items * sizeof(struct btrfs_key_ptr));
1917

1918 1919
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
1920

1921 1922
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);
Z
Zheng Yan 已提交
1923 1924 1925 1926

	ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
	BUG_ON(ret);

C
Chris Mason 已提交
1927
	return ret;
1928 1929
}

C
Chris Mason 已提交
1930 1931 1932 1933
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
C
Chris Mason 已提交
1934 1935
 *
 * returns zero on success or < 0 on failure.
C
Chris Mason 已提交
1936
 */
1937
static int noinline insert_new_root(struct btrfs_trans_handle *trans,
1938 1939
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
C
Chris Mason 已提交
1940
{
1941
	u64 lower_gen;
1942 1943
	struct extent_buffer *lower;
	struct extent_buffer *c;
1944
	struct extent_buffer *old;
1945
	struct btrfs_disk_key lower_key;
Z
Zheng Yan 已提交
1946
	int ret;
C
Chris Mason 已提交
1947 1948 1949 1950

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

1951 1952 1953 1954 1955 1956
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

Z
Zheng Yan 已提交
1957 1958
	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
				   root->root_key.objectid, trans->transid,
1959
				   level, root->node->start, 0);
1960 1961
	if (IS_ERR(c))
		return PTR_ERR(c);
1962

1963 1964 1965
	memset_extent_buffer(c, 0, 0, root->nodesize);
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
1966
	btrfs_set_header_bytenr(c, c->start);
1967 1968 1969 1970 1971 1972
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer(c, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(c),
			    BTRFS_FSID_SIZE);
1973 1974 1975 1976 1977

	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
			    BTRFS_UUID_SIZE);

1978
	btrfs_set_node_key(c, &lower_key, 0);
1979
	btrfs_set_node_blockptr(c, 0, lower->start);
1980
	lower_gen = btrfs_header_generation(lower);
Z
Zheng Yan 已提交
1981
	WARN_ON(lower_gen != trans->transid);
1982 1983

	btrfs_set_node_ptr_generation(c, 0, lower_gen);
1984

1985
	btrfs_mark_buffer_dirty(c);
1986

1987 1988
	spin_lock(&root->node_lock);
	old = root->node;
1989
	root->node = c;
1990 1991
	spin_unlock(&root->node_lock);

Z
Zheng Yan 已提交
1992 1993 1994
	ret = btrfs_update_extent_ref(trans, root, lower->start,
				      lower->start, c->start,
				      root->root_key.objectid,
1995
				      trans->transid, level - 1);
Z
Zheng Yan 已提交
1996 1997
	BUG_ON(ret);

1998 1999 2000
	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

2001
	add_root_to_dirty_list(root);
2002 2003
	extent_buffer_get(c);
	path->nodes[level] = c;
2004
	path->locks[level] = 1;
C
Chris Mason 已提交
2005 2006 2007 2008
	path->slots[level] = 0;
	return 0;
}

C
Chris Mason 已提交
2009 2010 2011
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
C
Chris Mason 已提交
2012
 *
C
Chris Mason 已提交
2013 2014
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
C
Chris Mason 已提交
2015 2016
 *
 * returns zero on success and < 0 on any error
C
Chris Mason 已提交
2017
 */
2018 2019
static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, struct btrfs_disk_key
2020
		      *key, u64 bytenr, int slot, int level)
C
Chris Mason 已提交
2021
{
2022
	struct extent_buffer *lower;
C
Chris Mason 已提交
2023
	int nritems;
C
Chris Mason 已提交
2024 2025

	BUG_ON(!path->nodes[level]);
2026 2027
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
C
Chris Mason 已提交
2028 2029
	if (slot > nritems)
		BUG();
C
Chris Mason 已提交
2030
	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
C
Chris Mason 已提交
2031 2032
		BUG();
	if (slot != nritems) {
2033 2034 2035
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
C
Chris Mason 已提交
2036
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
C
Chris Mason 已提交
2037
	}
2038
	btrfs_set_node_key(lower, key, slot);
2039
	btrfs_set_node_blockptr(lower, slot, bytenr);
2040 2041
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2042 2043
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
C
Chris Mason 已提交
2044 2045 2046
	return 0;
}

C
Chris Mason 已提交
2047 2048 2049 2050 2051 2052
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
C
Chris Mason 已提交
2053 2054
 *
 * returns 0 on success and < 0 on failure
C
Chris Mason 已提交
2055
 */
2056 2057 2058
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
2059
{
2060 2061 2062
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
2063
	int mid;
C
Chris Mason 已提交
2064
	int ret;
C
Chris Mason 已提交
2065
	int wret;
2066
	u32 c_nritems;
2067

2068
	c = path->nodes[level];
2069
	WARN_ON(btrfs_header_generation(c) != trans->transid);
2070
	if (c == root->node) {
C
Chris Mason 已提交
2071
		/* trying to split the root, lets make a new one */
2072
		ret = insert_new_root(trans, root, path, level + 1);
C
Chris Mason 已提交
2073 2074
		if (ret)
			return ret;
2075 2076
	} else {
		ret = push_nodes_for_insert(trans, root, path, level);
2077 2078
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
2079
		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2080
			return 0;
2081 2082
		if (ret < 0)
			return ret;
2083
	}
2084

2085
	c_nritems = btrfs_header_nritems(c);
2086

2087
	split = btrfs_alloc_free_block(trans, root, root->nodesize,
Z
Zheng Yan 已提交
2088 2089 2090
					path->nodes[level + 1]->start,
					root->root_key.objectid,
					trans->transid, level, c->start, 0);
2091 2092 2093 2094 2095
	if (IS_ERR(split))
		return PTR_ERR(split);

	btrfs_set_header_flags(split, btrfs_header_flags(c));
	btrfs_set_header_level(split, btrfs_header_level(c));
2096
	btrfs_set_header_bytenr(split, split->start);
2097 2098
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_owner(split, root->root_key.objectid);
2099
	btrfs_set_header_flags(split, 0);
2100 2101 2102
	write_extent_buffer(split, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(split),
			    BTRFS_FSID_SIZE);
2103 2104 2105
	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
			    BTRFS_UUID_SIZE);
2106

2107
	mid = (c_nritems + 1) / 2;
2108 2109 2110 2111 2112 2113 2114

	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
C
Chris Mason 已提交
2115 2116
	ret = 0;

2117 2118 2119 2120
	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	btrfs_node_key(split, &disk_key, 0);
2121
	wret = insert_ptr(trans, root, path, &disk_key, split->start,
2122
			  path->slots[level + 1] + 1,
C
Chris Mason 已提交
2123
			  level + 1);
C
Chris Mason 已提交
2124 2125 2126
	if (wret)
		ret = wret;

Z
Zheng Yan 已提交
2127 2128 2129
	ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
	BUG_ON(ret);

C
Chris Mason 已提交
2130
	if (path->slots[level] >= mid) {
C
Chris Mason 已提交
2131
		path->slots[level] -= mid;
2132
		btrfs_tree_unlock(c);
2133 2134
		free_extent_buffer(c);
		path->nodes[level] = split;
C
Chris Mason 已提交
2135 2136
		path->slots[level + 1] += 1;
	} else {
2137
		btrfs_tree_unlock(split);
2138
		free_extent_buffer(split);
2139
	}
C
Chris Mason 已提交
2140
	return ret;
2141 2142
}

C
Chris Mason 已提交
2143 2144 2145 2146 2147
/*
 * how many bytes are required to store the items in a leaf.  start
 * and nr indicate which items in the leaf to check.  This totals up the
 * space used both by the item structs and the item data
 */
2148
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2149 2150
{
	int data_len;
2151
	int nritems = btrfs_header_nritems(l);
2152
	int end = min(nritems, start + nr) - 1;
2153 2154 2155

	if (!nr)
		return 0;
2156 2157
	data_len = btrfs_item_end_nr(l, start);
	data_len = data_len - btrfs_item_offset_nr(l, end);
C
Chris Mason 已提交
2158
	data_len += sizeof(struct btrfs_item) * nr;
2159
	WARN_ON(data_len < 0);
2160 2161 2162
	return data_len;
}

2163 2164 2165 2166 2167
/*
 * The space between the end of the leaf items and
 * the start of the leaf data.  IOW, how much room
 * the leaf has left for both items and data
 */
2168 2169
int noinline btrfs_leaf_free_space(struct btrfs_root *root,
				   struct extent_buffer *leaf)
2170
{
2171 2172 2173 2174 2175
	int nritems = btrfs_header_nritems(leaf);
	int ret;
	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
	if (ret < 0) {
		printk("leaf free space ret %d, leaf data size %lu, used %d nritems %d\n",
J
Jens Axboe 已提交
2176
		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2177 2178 2179
		       leaf_space_used(leaf, 0, nritems), nritems);
	}
	return ret;
2180 2181
}

C
Chris Mason 已提交
2182 2183 2184
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
C
Chris Mason 已提交
2185 2186 2187
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
C
Chris Mason 已提交
2188
 */
2189
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2190 2191
			   *root, struct btrfs_path *path, int data_size,
			   int empty)
C
Chris Mason 已提交
2192
{
2193 2194 2195 2196
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	struct btrfs_disk_key disk_key;
C
Chris Mason 已提交
2197
	int slot;
2198
	u32 i;
C
Chris Mason 已提交
2199 2200 2201
	int free_space;
	int push_space = 0;
	int push_items = 0;
C
Chris Mason 已提交
2202
	struct btrfs_item *item;
2203
	u32 left_nritems;
2204
	u32 nr;
2205
	u32 right_nritems;
2206
	u32 data_end;
2207
	u32 this_item_size;
2208
	int ret;
C
Chris Mason 已提交
2209 2210 2211 2212 2213 2214

	slot = path->slots[1];
	if (!path->nodes[1]) {
		return 1;
	}
	upper = path->nodes[1];
2215
	if (slot >= btrfs_header_nritems(upper) - 1)
C
Chris Mason 已提交
2216
		return 1;
2217

2218 2219
	WARN_ON(!btrfs_tree_locked(path->nodes[1]));

2220
	right = read_node_slot(root, upper, slot + 1);
2221
	btrfs_tree_lock(right);
C
Chris Mason 已提交
2222
	free_space = btrfs_leaf_free_space(root, right);
2223 2224
	if (free_space < data_size + sizeof(struct btrfs_item))
		goto out_unlock;
2225

C
Chris Mason 已提交
2226
	/* cow and double check */
2227
	ret = btrfs_cow_block(trans, root, right, upper,
2228
			      slot + 1, &right, 0);
2229 2230 2231
	if (ret)
		goto out_unlock;

C
Chris Mason 已提交
2232
	free_space = btrfs_leaf_free_space(root, right);
2233 2234
	if (free_space < data_size + sizeof(struct btrfs_item))
		goto out_unlock;
C
Chris Mason 已提交
2235

2236
	left_nritems = btrfs_header_nritems(left);
2237 2238
	if (left_nritems == 0)
		goto out_unlock;
2239

2240 2241 2242 2243 2244
	if (empty)
		nr = 0;
	else
		nr = 1;

Z
Zheng Yan 已提交
2245 2246 2247
	if (path->slots[0] >= left_nritems)
		push_space += data_size + sizeof(*item);

2248 2249
	i = left_nritems - 1;
	while (i >= nr) {
2250
		item = btrfs_item_nr(left, i);
2251

Z
Zheng Yan 已提交
2252 2253 2254 2255 2256 2257 2258 2259 2260 2261
		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

C
Chris Mason 已提交
2262 2263
		if (path->slots[0] == i)
			push_space += data_size + sizeof(*item);
2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274

		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
C
Chris Mason 已提交
2275
			break;
Z
Zheng Yan 已提交
2276

C
Chris Mason 已提交
2277
		push_items++;
2278
		push_space += this_item_size + sizeof(*item);
2279 2280 2281
		if (i == 0)
			break;
		i--;
2282 2283 2284 2285
	}
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
C
Chris Mason 已提交
2286
	}
2287

2288 2289
	if (push_items == 0)
		goto out_unlock;
2290

2291
	if (!empty && push_items == left_nritems)
2292
		WARN_ON(1);
2293

C
Chris Mason 已提交
2294
	/* push left to right */
2295
	right_nritems = btrfs_header_nritems(right);
2296

2297
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
C
Chris Mason 已提交
2298
	push_space -= leaf_data_end(root, left);
2299

C
Chris Mason 已提交
2300
	/* make room in the right data area */
2301 2302 2303 2304 2305 2306
	data_end = leaf_data_end(root, right);
	memmove_extent_buffer(right,
			      btrfs_leaf_data(right) + data_end - push_space,
			      btrfs_leaf_data(right) + data_end,
			      BTRFS_LEAF_DATA_SIZE(root) - data_end);

C
Chris Mason 已提交
2307
	/* copy from the left data area */
2308
	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
C
Chris Mason 已提交
2309 2310 2311
		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
		     btrfs_leaf_data(left) + leaf_data_end(root, left),
		     push_space);
2312 2313 2314 2315 2316

	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

C
Chris Mason 已提交
2317
	/* copy the items from left to right */
2318 2319 2320
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));
C
Chris Mason 已提交
2321 2322

	/* update the item pointers */
2323
	right_nritems += push_items;
2324
	btrfs_set_header_nritems(right, right_nritems);
C
Chris Mason 已提交
2325
	push_space = BTRFS_LEAF_DATA_SIZE(root);
2326
	for (i = 0; i < right_nritems; i++) {
2327
		item = btrfs_item_nr(right, i);
2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}
		push_space -= btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
C
Chris Mason 已提交
2342
	}
2343
	left_nritems -= push_items;
2344
	btrfs_set_header_nritems(left, left_nritems);
C
Chris Mason 已提交
2345

2346 2347
	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
2348
	btrfs_mark_buffer_dirty(right);
2349

Z
Zheng Yan 已提交
2350 2351 2352
	ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
	BUG_ON(ret);

2353 2354
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
C
Chris Mason 已提交
2355
	btrfs_mark_buffer_dirty(upper);
C
Chris Mason 已提交
2356

C
Chris Mason 已提交
2357
	/* then fixup the leaf pointer in the path */
2358 2359
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
2360 2361 2362
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
2363 2364
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
C
Chris Mason 已提交
2365 2366
		path->slots[1] += 1;
	} else {
2367
		btrfs_tree_unlock(right);
2368
		free_extent_buffer(right);
C
Chris Mason 已提交
2369 2370
	}
	return 0;
2371 2372 2373 2374 2375

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
C
Chris Mason 已提交
2376
}
2377

C
Chris Mason 已提交
2378 2379 2380 2381
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 */
2382
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2383 2384
			  *root, struct btrfs_path *path, int data_size,
			  int empty)
2385
{
2386 2387 2388
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
2389 2390 2391 2392 2393
	int slot;
	int i;
	int free_space;
	int push_space = 0;
	int push_items = 0;
C
Chris Mason 已提交
2394
	struct btrfs_item *item;
2395
	u32 old_left_nritems;
2396
	u32 right_nritems;
2397
	u32 nr;
C
Chris Mason 已提交
2398 2399
	int ret = 0;
	int wret;
2400 2401
	u32 this_item_size;
	u32 old_left_item_size;
2402 2403

	slot = path->slots[1];
2404
	if (slot == 0)
2405
		return 1;
2406
	if (!path->nodes[1])
2407
		return 1;
2408

2409 2410 2411 2412 2413
	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0) {
		return 1;
	}

2414 2415
	WARN_ON(!btrfs_tree_locked(path->nodes[1]));

2416
	left = read_node_slot(root, path->nodes[1], slot - 1);
2417
	btrfs_tree_lock(left);
C
Chris Mason 已提交
2418
	free_space = btrfs_leaf_free_space(root, left);
C
Chris Mason 已提交
2419
	if (free_space < data_size + sizeof(struct btrfs_item)) {
2420 2421
		ret = 1;
		goto out;
2422
	}
C
Chris Mason 已提交
2423 2424

	/* cow and double check */
2425
	ret = btrfs_cow_block(trans, root, left,
2426
			      path->nodes[1], slot - 1, &left, 0);
2427 2428
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
2429 2430
		ret = 1;
		goto out;
2431
	}
2432

C
Chris Mason 已提交
2433
	free_space = btrfs_leaf_free_space(root, left);
C
Chris Mason 已提交
2434
	if (free_space < data_size + sizeof(struct btrfs_item)) {
2435 2436
		ret = 1;
		goto out;
C
Chris Mason 已提交
2437 2438
	}

2439 2440 2441 2442 2443 2444
	if (empty)
		nr = right_nritems;
	else
		nr = right_nritems - 1;

	for (i = 0; i < nr; i++) {
2445
		item = btrfs_item_nr(right, i);
2446 2447 2448 2449 2450 2451 2452 2453
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

Z
Zheng Yan 已提交
2454 2455 2456 2457 2458 2459 2460 2461 2462 2463
		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, right);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

2464 2465
		if (path->slots[0] == i)
			push_space += data_size + sizeof(*item);
2466 2467 2468

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
2469
			break;
2470

2471
		push_items++;
2472 2473 2474 2475 2476 2477
		push_space += this_item_size + sizeof(*item);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
2478
	}
2479

2480
	if (push_items == 0) {
2481 2482
		ret = 1;
		goto out;
2483
	}
2484
	if (!empty && push_items == btrfs_header_nritems(right))
2485
		WARN_ON(1);
2486

2487
	/* push data from right to left */
2488 2489 2490 2491 2492
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

C
Chris Mason 已提交
2493
	push_space = BTRFS_LEAF_DATA_SIZE(root) -
2494 2495 2496
		     btrfs_item_offset_nr(right, push_items -1);

	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
C
Chris Mason 已提交
2497 2498
		     leaf_data_end(root, left) - push_space,
		     btrfs_leaf_data(right) +
2499
		     btrfs_item_offset_nr(right, push_items - 1),
C
Chris Mason 已提交
2500
		     push_space);
2501
	old_left_nritems = btrfs_header_nritems(left);
2502 2503
	BUG_ON(old_left_nritems < 0);

2504
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
C
Chris Mason 已提交
2505
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2506
		u32 ioff;
2507

2508
		item = btrfs_item_nr(left, i);
2509 2510 2511 2512 2513 2514 2515 2516
		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

2517 2518
		ioff = btrfs_item_offset(left, item);
		btrfs_set_item_offset(left, item,
2519
		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2520
	}
2521
	btrfs_set_header_nritems(left, old_left_nritems + push_items);
2522 2523 2524 2525
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
	}
2526 2527

	/* fixup right node */
2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541
	if (push_items > right_nritems) {
		printk("push items %d nr %u\n", push_items, right_nritems);
		WARN_ON(1);
	}

	if (push_items < right_nritems) {
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(root, right);
		memmove_extent_buffer(right, btrfs_leaf_data(right) +
				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
				      btrfs_leaf_data(right) +
				      leaf_data_end(root, right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2542 2543 2544
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
2545
	}
2546 2547
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
C
Chris Mason 已提交
2548
	push_space = BTRFS_LEAF_DATA_SIZE(root);
2549 2550
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);
2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565

		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		push_space = push_space - btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}
	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
2566
	}
2567

2568
	btrfs_mark_buffer_dirty(left);
2569 2570
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
2571

Z
Zheng Yan 已提交
2572 2573 2574 2575
	ret = btrfs_update_ref(trans, root, right, left,
			       old_left_nritems, push_items);
	BUG_ON(ret);

2576 2577
	btrfs_item_key(right, &disk_key, 0);
	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
C
Chris Mason 已提交
2578 2579
	if (wret)
		ret = wret;
2580 2581 2582 2583

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
2584 2585 2586
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
2587 2588
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
2589 2590
		path->slots[1] -= 1;
	} else {
2591
		btrfs_tree_unlock(left);
2592
		free_extent_buffer(left);
2593 2594
		path->slots[0] -= push_items;
	}
2595
	BUG_ON(path->slots[0] < 0);
C
Chris Mason 已提交
2596
	return ret;
2597 2598 2599 2600
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
2601 2602
}

C
Chris Mason 已提交
2603 2604 2605
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
C
Chris Mason 已提交
2606 2607
 *
 * returns 0 if all went well and < 0 on failure.
C
Chris Mason 已提交
2608
 */
2609 2610 2611 2612 2613
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
2614
{
2615
	struct extent_buffer *l;
2616
	u32 nritems;
2617 2618
	int mid;
	int slot;
2619
	struct extent_buffer *right;
C
Chris Mason 已提交
2620
	int space_needed = data_size + sizeof(struct btrfs_item);
2621 2622 2623
	int data_copy_size;
	int rt_data_off;
	int i;
2624
	int ret = 0;
C
Chris Mason 已提交
2625
	int wret;
2626 2627
	int double_split;
	int num_doubles = 0;
2628
	struct btrfs_disk_key disk_key;
C
Chris Mason 已提交
2629

2630 2631 2632
	if (extend)
		space_needed = data_size;

C
Chris Mason 已提交
2633
	/* first try to make some room by pushing left and right */
2634
	if (ins_key->type != BTRFS_DIR_ITEM_KEY) {
2635
		wret = push_leaf_right(trans, root, path, data_size, 0);
2636
		if (wret < 0) {
C
Chris Mason 已提交
2637
			return wret;
2638 2639
		}
		if (wret) {
2640
			wret = push_leaf_left(trans, root, path, data_size, 0);
2641 2642 2643 2644
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];
C
Chris Mason 已提交
2645

2646
		/* did the pushes work? */
2647
		if (btrfs_leaf_free_space(root, l) >= space_needed)
2648
			return 0;
2649
	}
C
Chris Mason 已提交
2650

C
Chris Mason 已提交
2651
	if (!path->nodes[1]) {
2652
		ret = insert_new_root(trans, root, path, 1);
C
Chris Mason 已提交
2653 2654 2655
		if (ret)
			return ret;
	}
2656 2657 2658
again:
	double_split = 0;
	l = path->nodes[0];
2659
	slot = path->slots[0];
2660
	nritems = btrfs_header_nritems(l);
2661
	mid = (nritems + 1)/ 2;
2662

2663
	right = btrfs_alloc_free_block(trans, root, root->leafsize,
Z
Zheng Yan 已提交
2664 2665 2666
					path->nodes[1]->start,
					root->root_key.objectid,
					trans->transid, 0, l->start, 0);
2667 2668
	if (IS_ERR(right)) {
		BUG_ON(1);
2669
		return PTR_ERR(right);
2670
	}
2671 2672

	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2673
	btrfs_set_header_bytenr(right, right->start);
2674 2675 2676 2677 2678 2679
	btrfs_set_header_generation(right, trans->transid);
	btrfs_set_header_owner(right, root->root_key.objectid);
	btrfs_set_header_level(right, 0);
	write_extent_buffer(right, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(right),
			    BTRFS_FSID_SIZE);
2680 2681 2682 2683

	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
			    BTRFS_UUID_SIZE);
2684 2685 2686 2687 2688 2689
	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + space_needed >
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (slot >= nritems) {
				btrfs_cpu_key_to_disk(&disk_key, ins_key);
2690
				btrfs_set_header_nritems(right, 0);
2691
				wret = insert_ptr(trans, root, path,
2692
						  &disk_key, right->start,
2693 2694 2695
						  path->slots[1] + 1, 1);
				if (wret)
					ret = wret;
2696 2697

				btrfs_tree_unlock(path->nodes[0]);
2698 2699
				free_extent_buffer(path->nodes[0]);
				path->nodes[0] = right;
2700 2701
				path->slots[0] = 0;
				path->slots[1] += 1;
2702
				btrfs_mark_buffer_dirty(right);
2703 2704 2705
				return ret;
			}
			mid = slot;
2706 2707 2708 2709 2710
			if (mid != nritems &&
			    leaf_space_used(l, mid, nritems - mid) +
			    space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
				double_split = 1;
			}
2711 2712 2713 2714
		}
	} else {
		if (leaf_space_used(l, 0, mid + 1) + space_needed >
			BTRFS_LEAF_DATA_SIZE(root)) {
2715
			if (!extend && slot == 0) {
2716
				btrfs_cpu_key_to_disk(&disk_key, ins_key);
2717
				btrfs_set_header_nritems(right, 0);
2718 2719
				wret = insert_ptr(trans, root, path,
						  &disk_key,
2720
						  right->start,
2721
						  path->slots[1], 1);
2722 2723
				if (wret)
					ret = wret;
2724
				btrfs_tree_unlock(path->nodes[0]);
2725 2726
				free_extent_buffer(path->nodes[0]);
				path->nodes[0] = right;
2727
				path->slots[0] = 0;
2728 2729 2730 2731 2732 2733
				if (path->slots[1] == 0) {
					wret = fixup_low_keys(trans, root,
					           path, &disk_key, 1);
					if (wret)
						ret = wret;
				}
2734
				btrfs_mark_buffer_dirty(right);
2735
				return ret;
2736 2737 2738 2739 2740 2741 2742 2743 2744
			} else if (extend && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
					double_split = 1;
				}
2745
			}
2746 2747
		}
	}
2748 2749 2750 2751 2752 2753 2754 2755 2756
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);

	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	copy_extent_buffer(right, l,
C
Chris Mason 已提交
2757 2758 2759
		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
		     data_copy_size, btrfs_leaf_data(l) +
		     leaf_data_end(root, l), data_copy_size);
2760

C
Chris Mason 已提交
2761
	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2762
		      btrfs_item_end_nr(l, mid);
C
Chris Mason 已提交
2763

2764 2765
	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(right, i);
2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776
		u32 ioff;

		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		ioff = btrfs_item_offset(right, item);
2777
		btrfs_set_item_offset(right, item, ioff + rt_data_off);
C
Chris Mason 已提交
2778
	}
C
Chris Mason 已提交
2779

2780 2781 2782 2783 2784
	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}

2785
	btrfs_set_header_nritems(l, mid);
C
Chris Mason 已提交
2786
	ret = 0;
2787
	btrfs_item_key(right, &disk_key, 0);
2788 2789
	wret = insert_ptr(trans, root, path, &disk_key, right->start,
			  path->slots[1] + 1, 1);
C
Chris Mason 已提交
2790 2791
	if (wret)
		ret = wret;
2792 2793 2794

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
2795
	BUG_ON(path->slots[0] != slot);
2796

Z
Zheng Yan 已提交
2797 2798 2799
	ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
	BUG_ON(ret);

2800
	if (mid <= slot) {
2801
		btrfs_tree_unlock(path->nodes[0]);
2802 2803
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
2804 2805
		path->slots[0] -= mid;
		path->slots[1] += 1;
2806 2807
	} else {
		btrfs_tree_unlock(right);
2808
		free_extent_buffer(right);
2809
	}
2810

2811
	BUG_ON(path->slots[0] < 0);
2812

2813 2814 2815 2816
	if (double_split) {
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
2817
	}
2818 2819 2820
	return ret;
}

C
Chris Mason 已提交
2821 2822 2823 2824 2825 2826
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
C
Chris Mason 已提交
2827 2828 2829
int btrfs_truncate_item(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct btrfs_path *path,
2830
			u32 new_size, int from_end)
C
Chris Mason 已提交
2831 2832 2833 2834
{
	int ret = 0;
	int slot;
	int slot_orig;
2835 2836
	struct extent_buffer *leaf;
	struct btrfs_item *item;
C
Chris Mason 已提交
2837 2838 2839 2840 2841 2842 2843 2844
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;

	slot_orig = path->slots[0];
2845
	leaf = path->nodes[0];
2846 2847 2848 2849 2850
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	if (old_size == new_size)
		return 0;
C
Chris Mason 已提交
2851

2852
	nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
2853 2854
	data_end = leaf_data_end(root, leaf);

2855
	old_data_start = btrfs_item_offset_nr(leaf, slot);
2856

C
Chris Mason 已提交
2857 2858 2859 2860 2861 2862 2863 2864 2865 2866
	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
2867 2868
		u32 ioff;
		item = btrfs_item_nr(leaf, i);
2869 2870 2871 2872 2873 2874 2875 2876 2877

		if (!leaf->map_token) {
			map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
		}

2878 2879
		ioff = btrfs_item_offset(leaf, item);
		btrfs_set_item_offset(leaf, item, ioff + size_diff);
C
Chris Mason 已提交
2880
	}
2881 2882 2883 2884 2885 2886

	if (leaf->map_token) {
		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
		leaf->map_token = NULL;
	}

C
Chris Mason 已提交
2887
	/* shift the data */
2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926
	if (from_end) {
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				        (unsigned long)fi,
				        offsetof(struct btrfs_file_extent_item,
						 disk_bytenr));
			}
		}

		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start - data_end);

		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(trans, root, path, &disk_key, 1);
	}
2927 2928 2929 2930

	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);
C
Chris Mason 已提交
2931 2932

	ret = 0;
2933 2934
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
2935
		BUG();
2936
	}
C
Chris Mason 已提交
2937 2938 2939
	return ret;
}

C
Chris Mason 已提交
2940 2941 2942
/*
 * make the item pointed to by the path bigger, data_size is the new size.
 */
2943 2944 2945
int btrfs_extend_item(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root, struct btrfs_path *path,
		      u32 data_size)
2946 2947 2948 2949
{
	int ret = 0;
	int slot;
	int slot_orig;
2950 2951
	struct extent_buffer *leaf;
	struct btrfs_item *item;
2952 2953 2954 2955 2956 2957 2958
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;

	slot_orig = path->slots[0];
2959
	leaf = path->nodes[0];
2960

2961
	nritems = btrfs_header_nritems(leaf);
2962 2963
	data_end = leaf_data_end(root, leaf);

2964 2965
	if (btrfs_leaf_free_space(root, leaf) < data_size) {
		btrfs_print_leaf(root, leaf);
2966
		BUG();
2967
	}
2968
	slot = path->slots[0];
2969
	old_data = btrfs_item_end_nr(leaf, slot);
2970 2971

	BUG_ON(slot < 0);
2972 2973 2974 2975 2976
	if (slot >= nritems) {
		btrfs_print_leaf(root, leaf);
		printk("slot %d too large, nritems %d\n", slot, nritems);
		BUG_ON(1);
	}
2977 2978 2979 2980 2981 2982

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
2983 2984
		u32 ioff;
		item = btrfs_item_nr(leaf, i);
2985 2986 2987 2988 2989 2990 2991 2992

		if (!leaf->map_token) {
			map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
		}
2993 2994
		ioff = btrfs_item_offset(leaf, item);
		btrfs_set_item_offset(leaf, item, ioff - data_size);
2995
	}
2996

2997 2998 2999 3000 3001
	if (leaf->map_token) {
		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
		leaf->map_token = NULL;
	}

3002
	/* shift the data */
3003
	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3004 3005
		      data_end - data_size, btrfs_leaf_data(leaf) +
		      data_end, old_data - data_end);
3006

3007
	data_end = old_data;
3008 3009 3010 3011
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);
3012 3013

	ret = 0;
3014 3015
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
3016
		BUG();
3017
	}
3018 3019 3020
	return ret;
}

3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171
/*
 * Given a key and some data, insert items into the tree.
 * This does all the path init required, making room in the tree if needed.
 * Returns the number of keys that were inserted.
 */
int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_path *path,
			    struct btrfs_key *cpu_key, u32 *data_size,
			    int nr)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	int ret = 0;
	int slot;
	int slot_orig;
	int i;
	u32 nritems;
	u32 total_data = 0;
	u32 total_size = 0;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct btrfs_key found_key;

	found_key.objectid = 0;
	nr = min_t(int, nr, BTRFS_NODEPTRS_PER_BLOCK(root));

	for (i = 0; i < nr; i++)
		total_data += data_size[i];

	total_data = min_t(u32, total_data, BTRFS_LEAF_DATA_SIZE(root));
	total_size = total_data + (nr * sizeof(struct btrfs_item));
	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
	if (ret == 0)
		return -EEXIST;
	if (ret < 0)
		goto out;

	slot_orig = path->slots[0];
	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	if (btrfs_leaf_free_space(root, leaf) < total_size) {
		for (i = nr; i >= 0; i--) {
			total_data -= data_size[i];
			total_size -= data_size[i] + sizeof(struct btrfs_item);
			if (total_size < btrfs_leaf_free_space(root, leaf))
				break;
		}
		nr = i;
	}

	slot = path->slots[0];
	BUG_ON(slot < 0);

	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		item = btrfs_item_nr(leaf, slot);
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		/* figure out how many keys we can insert in here */
		total_data = data_size[0];
		for (i = 1; i < nr; i++) {
			if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
				break;
			total_data += data_size[i];
		}
		nr = i;

		if (old_data < data_end) {
			btrfs_print_leaf(root, leaf);
			printk("slot %d old_data %d data_end %d\n",
			       slot, old_data, data_end);
			BUG_ON(1);
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		WARN_ON(leaf->map_token);
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(leaf, i);
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}

			ioff = btrfs_item_offset(leaf, item);
			btrfs_set_item_offset(leaf, item, ioff - total_data);
		}
		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}

		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end - total_data, btrfs_leaf_data(leaf) +
			      data_end, old_data - data_end);
		data_end = old_data;
	} else {
		/*
		 * this sucks but it has to be done, if we are inserting at
		 * the end of the leaf only insert 1 of the items, since we
		 * have no way of knowing whats on the next leaf and we'd have
		 * to drop our current locks to figure it out
		 */
		nr = 1;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(leaf, slot + i);
		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
		data_end -= data_size[i];
		btrfs_set_item_size(leaf, item, data_size[i]);
	}
	btrfs_set_header_nritems(leaf, nritems + nr);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
	if (slot == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
	}

	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
out:
	if (!ret)
		ret = nr;
	return ret;
}

C
Chris Mason 已提交
3172
/*
C
Chris Mason 已提交
3173
 * Given a key and some data, insert items into the tree.
C
Chris Mason 已提交
3174 3175
 * This does all the path init required, making room in the tree if needed.
 */
3176
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3177 3178
			    struct btrfs_root *root,
			    struct btrfs_path *path,
3179 3180
			    struct btrfs_key *cpu_key, u32 *data_size,
			    int nr)
3181
{
3182 3183
	struct extent_buffer *leaf;
	struct btrfs_item *item;
C
Chris Mason 已提交
3184
	int ret = 0;
3185
	int slot;
3186
	int slot_orig;
3187
	int i;
3188
	u32 nritems;
3189 3190
	u32 total_size = 0;
	u32 total_data = 0;
3191
	unsigned int data_end;
C
Chris Mason 已提交
3192 3193
	struct btrfs_disk_key disk_key;

3194 3195 3196
	for (i = 0; i < nr; i++) {
		total_data += data_size[i];
	}
3197

3198
	total_size = total_data + (nr * sizeof(struct btrfs_item));
3199
	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
J
Josef Bacik 已提交
3200
	if (ret == 0)
3201
		return -EEXIST;
3202 3203
	if (ret < 0)
		goto out;
3204

3205
	slot_orig = path->slots[0];
3206
	leaf = path->nodes[0];
C
Chris Mason 已提交
3207

3208
	nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
3209
	data_end = leaf_data_end(root, leaf);
3210

3211
	if (btrfs_leaf_free_space(root, leaf) < total_size) {
3212 3213
		btrfs_print_leaf(root, leaf);
		printk("not enough freespace need %u have %d\n",
3214
		       total_size, btrfs_leaf_free_space(root, leaf));
3215
		BUG();
3216
	}
3217

3218
	slot = path->slots[0];
3219
	BUG_ON(slot < 0);
3220

3221
	if (slot != nritems) {
3222
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3223

3224 3225 3226 3227 3228 3229
		if (old_data < data_end) {
			btrfs_print_leaf(root, leaf);
			printk("slot %d old_data %d data_end %d\n",
			       slot, old_data, data_end);
			BUG_ON(1);
		}
3230 3231 3232 3233
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
3234
		WARN_ON(leaf->map_token);
C
Chris Mason 已提交
3235
		for (i = slot; i < nritems; i++) {
3236
			u32 ioff;
3237

3238
			item = btrfs_item_nr(leaf, i);
3239 3240 3241 3242 3243 3244 3245 3246
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}

3247
			ioff = btrfs_item_offset(leaf, item);
3248
			btrfs_set_item_offset(leaf, item, ioff - total_data);
C
Chris Mason 已提交
3249
		}
3250 3251 3252 3253
		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}
3254 3255

		/* shift the items */
3256
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3257
			      btrfs_item_nr_offset(slot),
C
Chris Mason 已提交
3258
			      (nritems - slot) * sizeof(struct btrfs_item));
3259 3260

		/* shift the data */
3261
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3262
			      data_end - total_data, btrfs_leaf_data(leaf) +
C
Chris Mason 已提交
3263
			      data_end, old_data - data_end);
3264 3265
		data_end = old_data;
	}
3266

3267
	/* setup the item for the new data */
3268 3269 3270 3271 3272 3273 3274 3275 3276
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(leaf, slot + i);
		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
		data_end -= data_size[i];
		btrfs_set_item_size(leaf, item, data_size[i]);
	}
	btrfs_set_header_nritems(leaf, nritems + nr);
3277
	btrfs_mark_buffer_dirty(leaf);
C
Chris Mason 已提交
3278 3279

	ret = 0;
3280 3281
	if (slot == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3282
		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3283
	}
C
Chris Mason 已提交
3284

3285 3286
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
3287
		BUG();
3288
	}
3289
out:
3290 3291 3292 3293 3294 3295 3296
	return ret;
}

/*
 * Given a key and some data, insert an item into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
3297 3298 3299
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *cpu_key, void *data, u32
		      data_size)
3300 3301
{
	int ret = 0;
C
Chris Mason 已提交
3302
	struct btrfs_path *path;
3303 3304
	struct extent_buffer *leaf;
	unsigned long ptr;
3305

C
Chris Mason 已提交
3306 3307 3308
	path = btrfs_alloc_path();
	BUG_ON(!path);
	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3309
	if (!ret) {
3310 3311 3312 3313
		leaf = path->nodes[0];
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(leaf);
3314
	}
C
Chris Mason 已提交
3315
	btrfs_free_path(path);
C
Chris Mason 已提交
3316
	return ret;
3317 3318
}

C
Chris Mason 已提交
3319
/*
C
Chris Mason 已提交
3320
 * delete the pointer from a given node.
C
Chris Mason 已提交
3321
 *
C
Chris Mason 已提交
3322 3323
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
C
Chris Mason 已提交
3324
 */
3325 3326
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		   struct btrfs_path *path, int level, int slot)
3327
{
3328
	struct extent_buffer *parent = path->nodes[level];
3329
	u32 nritems;
C
Chris Mason 已提交
3330
	int ret = 0;
3331
	int wret;
3332

3333
	nritems = btrfs_header_nritems(parent);
3334
	if (slot != nritems -1) {
3335 3336 3337
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
C
Chris Mason 已提交
3338 3339
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
3340
	}
3341
	nritems--;
3342
	btrfs_set_header_nritems(parent, nritems);
3343
	if (nritems == 0 && parent == root->node) {
3344
		BUG_ON(btrfs_header_level(root->node) != 1);
3345
		/* just turn the root into a leaf and break */
3346
		btrfs_set_header_level(root->node, 0);
3347
	} else if (slot == 0) {
3348 3349 3350 3351
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
C
Chris Mason 已提交
3352 3353
		if (wret)
			ret = wret;
3354
	}
C
Chris Mason 已提交
3355
	btrfs_mark_buffer_dirty(parent);
C
Chris Mason 已提交
3356
	return ret;
3357 3358
}

3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].  bytenr is the node block pointer, but since the callers
 * already know it, it is faster to have them pass it down than to
 * read it out of the node again.
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.  zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_path *path, u64 bytenr)
{
	int ret;
	u64 root_gen = btrfs_header_generation(path->nodes[1]);

	ret = del_ptr(trans, root, path, 1, path->slots[1]);
	if (ret)
		return ret;

	ret = btrfs_free_extent(trans, root, bytenr,
				btrfs_level_size(root, 0),
				path->nodes[1]->start,
				btrfs_header_owner(path->nodes[1]),
3386
				root_gen, 0, 1);
3387 3388
	return ret;
}
C
Chris Mason 已提交
3389 3390 3391 3392
/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 */
3393 3394
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
3395
{
3396 3397
	struct extent_buffer *leaf;
	struct btrfs_item *item;
3398 3399
	int last_off;
	int dsize = 0;
C
Chris Mason 已提交
3400 3401
	int ret = 0;
	int wret;
3402
	int i;
3403
	u32 nritems;
3404

3405
	leaf = path->nodes[0];
3406 3407 3408 3409 3410
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

3411
	nritems = btrfs_header_nritems(leaf);
3412

3413
	if (slot + nr != nritems) {
C
Chris Mason 已提交
3414
		int data_end = leaf_data_end(root, leaf);
3415 3416

		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
C
Chris Mason 已提交
3417 3418
			      data_end + dsize,
			      btrfs_leaf_data(leaf) + data_end,
3419
			      last_off - data_end);
3420

3421
		for (i = slot + nr; i < nritems; i++) {
3422
			u32 ioff;
3423

3424
			item = btrfs_item_nr(leaf, i);
3425 3426 3427 3428 3429 3430 3431
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}
3432 3433
			ioff = btrfs_item_offset(leaf, item);
			btrfs_set_item_offset(leaf, item, ioff + dsize);
C
Chris Mason 已提交
3434
		}
3435 3436 3437 3438 3439 3440

		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}

3441
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3442
			      btrfs_item_nr_offset(slot + nr),
C
Chris Mason 已提交
3443
			      sizeof(struct btrfs_item) *
3444
			      (nritems - slot - nr));
3445
	}
3446 3447
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;
3448

C
Chris Mason 已提交
3449
	/* delete the leaf if we've emptied it */
3450
	if (nritems == 0) {
3451 3452
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
3453
		} else {
3454 3455
			ret = btrfs_del_leaf(trans, root, path, leaf->start);
			BUG_ON(ret);
3456
		}
3457
	} else {
3458
		int used = leaf_space_used(leaf, 0, nritems);
C
Chris Mason 已提交
3459
		if (slot == 0) {
3460 3461 3462
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
3463
			wret = fixup_low_keys(trans, root, path,
3464
					      &disk_key, 1);
C
Chris Mason 已提交
3465 3466 3467 3468
			if (wret)
				ret = wret;
		}

C
Chris Mason 已提交
3469
		/* delete the leaf if it is mostly empty */
3470
		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3471 3472 3473 3474
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
3475
			slot = path->slots[1];
3476 3477
			extent_buffer_get(leaf);

3478
			wret = push_leaf_left(trans, root, path, 1, 1);
3479
			if (wret < 0 && wret != -ENOSPC)
C
Chris Mason 已提交
3480
				ret = wret;
3481 3482 3483

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
3484
				wret = push_leaf_right(trans, root, path, 1, 1);
3485
				if (wret < 0 && wret != -ENOSPC)
C
Chris Mason 已提交
3486 3487
					ret = wret;
			}
3488 3489

			if (btrfs_header_nritems(leaf) == 0) {
3490 3491 3492
				path->slots[1] = slot;
				ret = btrfs_del_leaf(trans, root, path, leaf->start);
				BUG_ON(ret);
3493
				free_extent_buffer(leaf);
C
Chris Mason 已提交
3494
			} else {
3495 3496 3497 3498 3499 3500 3501
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
3502
				free_extent_buffer(leaf);
3503
			}
3504
		} else {
3505
			btrfs_mark_buffer_dirty(leaf);
3506 3507
		}
	}
C
Chris Mason 已提交
3508
	return ret;
3509 3510
}

3511
/*
3512
 * search the tree again to find a leaf with lesser keys
3513 3514
 * returns 0 if it found something or 1 if there are no lesser leaves.
 * returns < 0 on io errors.
C
Chris Mason 已提交
3515 3516 3517
 *
 * This may release the path, and so you may lose any locks held at the
 * time you call it.
3518 3519 3520
 */
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
3521 3522 3523
	struct btrfs_key key;
	struct btrfs_disk_key found_key;
	int ret;
3524

3525
	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3526

3527 3528 3529 3530 3531 3532 3533 3534
	if (key.offset > 0)
		key.offset--;
	else if (key.type > 0)
		key.type--;
	else if (key.objectid > 0)
		key.objectid--;
	else
		return 1;
3535

3536 3537 3538 3539 3540 3541 3542 3543 3544
	btrfs_release_path(root, path);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = comp_keys(&found_key, &key);
	if (ret < 0)
		return 0;
	return 1;
3545 3546
}

3547 3548 3549
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that are either in cache or have a minimum
C
Chris Mason 已提交
3550
 * transaction id.  This is used by the btree defrag code, and tree logging
3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This does lock as it descends, and path->keep_locks should be set
 * to 1 by the caller.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
C
Chris Mason 已提交
3562 3563 3564 3565
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
3566 3567 3568 3569
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3570
			 struct btrfs_key *max_key,
3571 3572 3573 3574 3575 3576
			 struct btrfs_path *path, int cache_only,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
3577
	int sret;
3578 3579 3580 3581 3582 3583 3584
	u32 nritems;
	int level;
	int ret = 1;

again:
	cur = btrfs_lock_root_node(root);
	level = btrfs_header_level(cur);
3585
	WARN_ON(path->nodes[level]);
3586 3587 3588 3589 3590 3591 3592 3593 3594 3595
	path->nodes[level] = cur;
	path->locks[level] = 1;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while(1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
3596
		sret = bin_search(cur, min_key, level, &slot);
3597

3598 3599
		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
3600 3601
			if (slot >= nritems)
				goto find_next_key;
3602 3603 3604 3605 3606
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
3607 3608
		if (sret && slot > 0)
			slot--;
3609 3610 3611 3612 3613 3614 3615 3616 3617
		/*
		 * check this node pointer against the cache_only and
		 * min_trans parameters.  If it isn't in cache or is too
		 * old, skip to the next one.
		 */
		while(slot < nritems) {
			u64 blockptr;
			u64 gen;
			struct extent_buffer *tmp;
3618 3619
			struct btrfs_disk_key disk_key;

3620 3621 3622 3623 3624 3625 3626 3627 3628
			blockptr = btrfs_node_blockptr(cur, slot);
			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			if (!cache_only)
				break;

3629 3630 3631 3632 3633 3634 3635 3636
			if (max_key) {
				btrfs_node_key(cur, &disk_key, slot);
				if (comp_keys(&disk_key, max_key) >= 0) {
					ret = 1;
					goto out;
				}
			}

3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647
			tmp = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));

			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
				free_extent_buffer(tmp);
				break;
			}
			if (tmp)
				free_extent_buffer(tmp);
			slot++;
		}
3648
find_next_key:
3649 3650 3651 3652 3653
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
3654 3655
			path->slots[level] = slot;
			sret = btrfs_find_next_key(root, path, min_key, level,
3656
						  cache_only, min_trans);
3657
			if (sret == 0) {
3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696
				btrfs_release_path(root, path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			unlock_up(path, level, 1);
			goto out;
		}
		cur = read_node_slot(root, cur, slot);

		btrfs_tree_lock(cur);
		path->locks[level - 1] = 1;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1);
	}
out:
	if (ret == 0)
		memcpy(min_key, &found_key, sizeof(found_key));
	return ret;
}

/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the cache_only and min_trans
 * parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
3697
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3698 3699
			struct btrfs_key *key, int lowest_level,
			int cache_only, u64 min_trans)
3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710
{
	int level = lowest_level;
	int slot;
	struct extent_buffer *c;

	while(level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
3711
next:
3712 3713 3714 3715 3716 3717 3718 3719 3720
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				return 1;
			}
			continue;
		}
		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740
		else {
			u64 blockptr = btrfs_node_blockptr(c, slot);
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (cache_only) {
				struct extent_buffer *cur;
				cur = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));
				if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
					slot++;
					if (cur)
						free_extent_buffer(cur);
					goto next;
				}
				free_extent_buffer(cur);
			}
			if (gen < min_trans) {
				slot++;
				goto next;
			}
3741
			btrfs_node_key_to_cpu(c, key, slot);
3742
		}
3743 3744 3745 3746 3747
		return 0;
	}
	return 1;
}

C
Chris Mason 已提交
3748
/*
3749
 * search the tree again to find a leaf with greater keys
C
Chris Mason 已提交
3750 3751
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
C
Chris Mason 已提交
3752
 */
C
Chris Mason 已提交
3753
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3754 3755 3756
{
	int slot;
	int level = 1;
3757 3758
	struct extent_buffer *c;
	struct extent_buffer *next = NULL;
3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770
	struct btrfs_key key;
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0) {
		return 1;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);

	btrfs_release_path(root, path);
3771
	path->keep_locks = 1;
3772 3773 3774 3775 3776 3777
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

3778
	nritems = btrfs_header_nritems(path->nodes[0]);
3779 3780 3781 3782 3783 3784
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
3785
	if (nritems > 0 && path->slots[0] < nritems - 1) {
3786
		path->slots[0]++;
3787 3788
		goto done;
	}
3789

C
Chris Mason 已提交
3790
	while(level < BTRFS_MAX_LEVEL) {
3791
		if (!path->nodes[level])
C
Chris Mason 已提交
3792
			return 1;
3793

3794 3795
		slot = path->slots[level] + 1;
		c = path->nodes[level];
3796
		if (slot >= btrfs_header_nritems(c)) {
3797
			level++;
3798
			if (level == BTRFS_MAX_LEVEL) {
3799
				return 1;
3800
			}
3801 3802
			continue;
		}
3803

3804 3805
		if (next) {
			btrfs_tree_unlock(next);
3806
			free_extent_buffer(next);
3807
		}
3808

3809 3810
		if (level == 1 && (path->locks[1] || path->skip_locking) &&
		    path->reada)
3811
			reada_for_search(root, path, level, slot, 0);
3812

3813
		next = read_node_slot(root, c, slot);
3814 3815 3816 3817
		if (!path->skip_locking) {
			WARN_ON(!btrfs_tree_locked(c));
			btrfs_tree_lock(next);
		}
3818 3819 3820 3821 3822 3823
		break;
	}
	path->slots[level] = slot;
	while(1) {
		level--;
		c = path->nodes[level];
3824 3825
		if (path->locks[level])
			btrfs_tree_unlock(c);
3826
		free_extent_buffer(c);
3827 3828
		path->nodes[level] = next;
		path->slots[level] = 0;
3829 3830
		if (!path->skip_locking)
			path->locks[level] = 1;
3831 3832
		if (!level)
			break;
3833 3834
		if (level == 1 && path->locks[1] && path->reada)
			reada_for_search(root, path, level, slot, 0);
3835
		next = read_node_slot(root, next, 0);
3836 3837 3838 3839
		if (!path->skip_locking) {
			WARN_ON(!btrfs_tree_locked(path->nodes[level]));
			btrfs_tree_lock(next);
		}
3840
	}
3841 3842
done:
	unlock_up(path, 0, 1);
3843 3844
	return 0;
}
3845

3846 3847 3848 3849 3850 3851
/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
3852 3853 3854 3855 3856 3857
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
3858
	u32 nritems;
3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869
	int ret;

	while(1) {
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
3870 3871 3872 3873 3874 3875
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

3876 3877 3878
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.type == type)
			return 0;
3879 3880 3881 3882 3883
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
3884 3885 3886
	}
	return 1;
}