ctree.c 100.1 KB
Newer Older
C
Chris Mason 已提交
1
/*
C
Chris Mason 已提交
2
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
C
Chris Mason 已提交
3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

19
#include <linux/sched.h>
20 21
#include "ctree.h"
#include "disk-io.h"
22
#include "transaction.h"
23
#include "print-tree.h"
24
#include "locking.h"
25

26 27 28
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
29
		      *root, struct btrfs_key *ins_key,
30
		      struct btrfs_path *path, int data_size, int extend);
31 32
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
33
			  struct extent_buffer *src, int empty);
34 35 36 37
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
38 39
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		   struct btrfs_path *path, int level, int slot);
40

C
Chris Mason 已提交
41
inline void btrfs_init_path(struct btrfs_path *p)
C
Chris Mason 已提交
42
{
C
Chris Mason 已提交
43
	memset(p, 0, sizeof(*p));
C
Chris Mason 已提交
44 45
}

C
Chris Mason 已提交
46
struct btrfs_path *btrfs_alloc_path(void)
C
Chris Mason 已提交
47
{
C
Chris Mason 已提交
48 49
	struct btrfs_path *path;
	path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
50
	if (path) {
C
Chris Mason 已提交
51
		btrfs_init_path(path);
52 53
		path->reada = 1;
	}
C
Chris Mason 已提交
54
	return path;
C
Chris Mason 已提交
55 56
}

C
Chris Mason 已提交
57
/* this also releases the path */
C
Chris Mason 已提交
58
void btrfs_free_path(struct btrfs_path *p)
59
{
C
Chris Mason 已提交
60 61
	btrfs_release_path(NULL, p);
	kmem_cache_free(btrfs_path_cachep, p);
62 63
}

C
Chris Mason 已提交
64 65 66 67 68 69
/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that no locks or extent buffers held.
 */
70
void noinline btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
71 72
{
	int i;
73

C
Chris Mason 已提交
74
	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
75
		p->slots[i] = 0;
76
		if (!p->nodes[i])
77 78 79 80 81
			continue;
		if (p->locks[i]) {
			btrfs_tree_unlock(p->nodes[i]);
			p->locks[i] = 0;
		}
82
		free_extent_buffer(p->nodes[i]);
83
		p->nodes[i] = NULL;
84 85 86
	}
}

C
Chris Mason 已提交
87 88 89 90 91 92 93 94 95 96
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
97 98 99 100 101 102 103 104 105 106
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;
	spin_lock(&root->node_lock);
	eb = root->node;
	extent_buffer_get(eb);
	spin_unlock(&root->node_lock);
	return eb;
}

C
Chris Mason 已提交
107 108 109 110
/* loop around taking references on and locking the root node of the
 * tree until you end up with a lock on the root.  A locked buffer
 * is returned, with a reference held.
 */
111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while(1) {
		eb = btrfs_root_node(root);
		btrfs_tree_lock(eb);

		spin_lock(&root->node_lock);
		if (eb == root->node) {
			spin_unlock(&root->node_lock);
			break;
		}
		spin_unlock(&root->node_lock);

		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

C
Chris Mason 已提交
132 133 134 135
/* cowonly root (everything not a reference counted cow subvolume), just get
 * put onto a simple dirty list.  transaction.c walks this to make sure they
 * get properly updated on disk.
 */
136 137 138 139 140 141 142 143
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	if (root->track_dirty && list_empty(&root->dirty_list)) {
		list_add(&root->dirty_list,
			 &root->fs_info->dirty_cowonly_roots);
	}
}

C
Chris Mason 已提交
144 145 146 147 148
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
149 150 151 152 153 154 155 156 157
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	u32 nritems;
	int ret = 0;
	int level;
158
	struct btrfs_root *new_root;
159

160 161 162 163 164 165
	new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
	if (!new_root)
		return -ENOMEM;

	memcpy(new_root, root, sizeof(*new_root));
	new_root->root_key.objectid = new_root_objectid;
166 167 168 169 170 171 172

	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);

	level = btrfs_header_level(buf);
	nritems = btrfs_header_nritems(buf);
Z
Zheng Yan 已提交
173 174 175 176

	cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
				     new_root_objectid, trans->transid,
				     level, buf->start, 0);
177 178
	if (IS_ERR(cow)) {
		kfree(new_root);
179
		return PTR_ERR(cow);
180
	}
181 182 183 184 185

	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_owner(cow, new_root_objectid);
186
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
187

Y
Yan Zheng 已提交
188 189 190 191
	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

192
	WARN_ON(btrfs_header_generation(buf) > trans->transid);
Z
Zheng Yan 已提交
193
	ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
194 195
	kfree(new_root);

196 197 198 199 200 201 202 203
	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}

C
Chris Mason 已提交
204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219
/*
 * does the dirty work in cow of a single block.  The parent block
 * (if supplied) is updated to point to the new cow copy.  The new
 * buffer is marked dirty and returned locked.  If you modify the block
 * it needs to be marked dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in bytes
 * the allocator should try to find free next to the block it returns.  This is
 * just a hint and may be ignored by the allocator.
 *
 * prealloc_dest -- if you have already reserved a destination for the cow,
 * this uses that block instead of allocating a new one.  btrfs_alloc_reserved_extent
 * is used to finish the allocation.
 */
220
int noinline __btrfs_cow_block(struct btrfs_trans_handle *trans,
221 222 223 224
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
225 226
			     u64 search_start, u64 empty_size,
			     u64 prealloc_dest)
C
Chris Mason 已提交
227
{
Z
Zheng Yan 已提交
228
	u64 parent_start;
229
	struct extent_buffer *cow;
230
	u32 nritems;
231
	int ret = 0;
232
	int level;
233
	int unlock_orig = 0;
234

235 236 237 238 239
	if (*cow_ret == buf)
		unlock_orig = 1;

	WARN_ON(!btrfs_tree_locked(buf));

Z
Zheng Yan 已提交
240 241 242 243 244
	if (parent)
		parent_start = parent->start;
	else
		parent_start = 0;

245 246
	WARN_ON(root->ref_cows && trans->transid !=
		root->fs_info->running_transaction->transid);
247
	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
248

249 250
	level = btrfs_header_level(buf);
	nritems = btrfs_header_nritems(buf);
Z
Zheng Yan 已提交
251

252 253 254 255 256 257 258
	if (prealloc_dest) {
		struct btrfs_key ins;

		ins.objectid = prealloc_dest;
		ins.offset = buf->len;
		ins.type = BTRFS_EXTENT_ITEM_KEY;

Z
Zheng Yan 已提交
259
		ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
260
						  root->root_key.objectid,
261
						  trans->transid, level, &ins);
262 263 264 265 266
		BUG_ON(ret);
		cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
					    buf->len);
	} else {
		cow = btrfs_alloc_free_block(trans, root, buf->len,
Z
Zheng Yan 已提交
267
					     parent_start,
268
					     root->root_key.objectid,
Z
Zheng Yan 已提交
269 270
					     trans->transid, level,
					     search_start, empty_size);
271
	}
272 273
	if (IS_ERR(cow))
		return PTR_ERR(cow);
274

275
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
276
	btrfs_set_header_bytenr(cow, cow->start);
277 278
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_owner(cow, root->root_key.objectid);
279
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
280

Y
Yan Zheng 已提交
281 282 283 284
	write_extent_buffer(cow, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(cow),
			    BTRFS_FSID_SIZE);

285 286
	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	if (btrfs_header_generation(buf) != trans->transid) {
Z
Zheng Yan 已提交
287 288
		u32 nr_extents;
		ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
289 290
		if (ret)
			return ret;
Z
Zheng Yan 已提交
291 292 293

		ret = btrfs_cache_ref(trans, root, buf, nr_extents);
		WARN_ON(ret);
Z
Zheng Yan 已提交
294 295 296 297
	} else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
		/*
		 * There are only two places that can drop reference to
		 * tree blocks owned by living reloc trees, one is here,
Y
Yan Zheng 已提交
298
		 * the other place is btrfs_drop_subtree. In both places,
Z
Zheng Yan 已提交
299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
		 * we check reference count while tree block is locked.
		 * Furthermore, if reference count is one, it won't get
		 * increased by someone else.
		 */
		u32 refs;
		ret = btrfs_lookup_extent_ref(trans, root, buf->start,
					      buf->len, &refs);
		BUG_ON(ret);
		if (refs == 1) {
			ret = btrfs_update_ref(trans, root, buf, cow,
					       0, nritems);
			clean_tree_block(trans, root, buf);
		} else {
			ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
		}
		BUG_ON(ret);
315
	} else {
Z
Zheng Yan 已提交
316 317 318
		ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
		if (ret)
			return ret;
319 320 321
		clean_tree_block(trans, root, buf);
	}

Z
Zheng Yan 已提交
322 323 324 325 326
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
		WARN_ON(ret);
	}

C
Chris Mason 已提交
327
	if (buf == root->node) {
328 329 330
		WARN_ON(parent && parent != buf);

		spin_lock(&root->node_lock);
C
Chris Mason 已提交
331
		root->node = cow;
332
		extent_buffer_get(cow);
333 334
		spin_unlock(&root->node_lock);

C
Chris Mason 已提交
335
		if (buf != root->commit_root) {
336
			btrfs_free_extent(trans, root, buf->start,
Z
Zheng Yan 已提交
337 338 339
					  buf->len, buf->start,
					  root->root_key.objectid,
					  btrfs_header_generation(buf),
340
					  level, 1);
C
Chris Mason 已提交
341
		}
342
		free_extent_buffer(buf);
343
		add_root_to_dirty_list(root);
C
Chris Mason 已提交
344
	} else {
345
		btrfs_set_node_blockptr(parent, parent_slot,
346
					cow->start);
347 348 349
		WARN_ON(trans->transid == 0);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
C
Chris Mason 已提交
350
		btrfs_mark_buffer_dirty(parent);
351
		WARN_ON(btrfs_header_generation(parent) != trans->transid);
352
		btrfs_free_extent(trans, root, buf->start, buf->len,
Z
Zheng Yan 已提交
353
				  parent_start, btrfs_header_owner(parent),
354
				  btrfs_header_generation(parent), level, 1);
C
Chris Mason 已提交
355
	}
356 357
	if (unlock_orig)
		btrfs_tree_unlock(buf);
358
	free_extent_buffer(buf);
C
Chris Mason 已提交
359
	btrfs_mark_buffer_dirty(cow);
C
Chris Mason 已提交
360
	*cow_ret = cow;
C
Chris Mason 已提交
361 362 363
	return 0;
}

C
Chris Mason 已提交
364 365 366 367 368
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 */
369
int noinline btrfs_cow_block(struct btrfs_trans_handle *trans,
370 371
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
372
		    struct extent_buffer **cow_ret, u64 prealloc_dest)
373 374
{
	u64 search_start;
375
	int ret;
C
Chris Mason 已提交
376

377 378 379 380 381 382 383 384 385 386
	if (trans->transaction != root->fs_info->running_transaction) {
		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
		       root->fs_info->running_transaction->transid);
		WARN_ON(1);
	}
	if (trans->transid != root->fs_info->generation) {
		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
		       root->fs_info->generation);
		WARN_ON(1);
	}
C
Chris Mason 已提交
387

388
	spin_lock(&root->fs_info->hash_lock);
389 390
	if (btrfs_header_generation(buf) == trans->transid &&
	    btrfs_header_owner(buf) == root->root_key.objectid &&
391
	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
392
		*cow_ret = buf;
393
		spin_unlock(&root->fs_info->hash_lock);
394
		WARN_ON(prealloc_dest);
395 396
		return 0;
	}
397
	spin_unlock(&root->fs_info->hash_lock);
398
	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
399
	ret = __btrfs_cow_block(trans, root, buf, parent,
400 401
				 parent_slot, cow_ret, search_start, 0,
				 prealloc_dest);
402
	return ret;
403 404
}

C
Chris Mason 已提交
405 406 407 408
/*
 * helper function for defrag to decide if two blocks pointed to by a
 * node are actually close by
 */
409
static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
410
{
411
	if (blocknr < other && other - (blocknr + blocksize) < 32768)
412
		return 1;
413
	if (blocknr > other && blocknr - (other + blocksize) < 32768)
414 415 416 417
		return 1;
	return 0;
}

418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441
/*
 * compare two keys in a memcmp fashion
 */
static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
{
	struct btrfs_key k1;

	btrfs_disk_key_to_cpu(&k1, disk);

	if (k1.objectid > k2->objectid)
		return 1;
	if (k1.objectid < k2->objectid)
		return -1;
	if (k1.type > k2->type)
		return 1;
	if (k1.type < k2->type)
		return -1;
	if (k1.offset > k2->offset)
		return 1;
	if (k1.offset < k2->offset)
		return -1;
	return 0;
}

442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460
/*
 * same as comp_keys only with two btrfs_key's
 */
static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
{
	if (k1->objectid > k2->objectid)
		return 1;
	if (k1->objectid < k2->objectid)
		return -1;
	if (k1->type > k2->type)
		return 1;
	if (k1->type < k2->type)
		return -1;
	if (k1->offset > k2->offset)
		return 1;
	if (k1->offset < k2->offset)
		return -1;
	return 0;
}
461

C
Chris Mason 已提交
462 463 464 465 466
/*
 * this is used by the defrag code to go through all the
 * leaves pointed to by a node and reallocate them so that
 * disk order is close to key order
 */
467
int btrfs_realloc_node(struct btrfs_trans_handle *trans,
468
		       struct btrfs_root *root, struct extent_buffer *parent,
469 470
		       int start_slot, int cache_only, u64 *last_ret,
		       struct btrfs_key *progress)
471
{
472
	struct extent_buffer *cur;
473
	u64 blocknr;
474
	u64 gen;
475 476
	u64 search_start = *last_ret;
	u64 last_block = 0;
477 478 479 480 481
	u64 other;
	u32 parent_nritems;
	int end_slot;
	int i;
	int err = 0;
482
	int parent_level;
483 484
	int uptodate;
	u32 blocksize;
485 486
	int progress_passed = 0;
	struct btrfs_disk_key disk_key;
487

488 489 490 491
	parent_level = btrfs_header_level(parent);
	if (cache_only && parent_level != 1)
		return 0;

492 493 494 495 496 497 498 499 500 501
	if (trans->transaction != root->fs_info->running_transaction) {
		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
		       root->fs_info->running_transaction->transid);
		WARN_ON(1);
	}
	if (trans->transid != root->fs_info->generation) {
		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
		       root->fs_info->generation);
		WARN_ON(1);
	}
502

503 504
	parent_nritems = btrfs_header_nritems(parent);
	blocksize = btrfs_level_size(root, parent_level - 1);
505 506 507 508 509 510 511
	end_slot = parent_nritems;

	if (parent_nritems == 1)
		return 0;

	for (i = start_slot; i < end_slot; i++) {
		int close = 1;
512

513 514 515 516 517 518 519 520
		if (!parent->map_token) {
			map_extent_buffer(parent,
					btrfs_node_key_ptr_offset(i),
					sizeof(struct btrfs_key_ptr),
					&parent->map_token, &parent->kaddr,
					&parent->map_start, &parent->map_len,
					KM_USER1);
		}
521 522 523 524 525
		btrfs_node_key(parent, &disk_key, i);
		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
			continue;

		progress_passed = 1;
526
		blocknr = btrfs_node_blockptr(parent, i);
527
		gen = btrfs_node_ptr_generation(parent, i);
528 529
		if (last_block == 0)
			last_block = blocknr;
530

531
		if (i > 0) {
532 533
			other = btrfs_node_blockptr(parent, i - 1);
			close = close_blocks(blocknr, other, blocksize);
534
		}
C
Chris Mason 已提交
535
		if (!close && i < end_slot - 2) {
536 537
			other = btrfs_node_blockptr(parent, i + 1);
			close = close_blocks(blocknr, other, blocksize);
538
		}
539 540
		if (close) {
			last_block = blocknr;
541
			continue;
542
		}
543 544 545 546 547
		if (parent->map_token) {
			unmap_extent_buffer(parent, parent->map_token,
					    KM_USER1);
			parent->map_token = NULL;
		}
548

549 550
		cur = btrfs_find_tree_block(root, blocknr, blocksize);
		if (cur)
551
			uptodate = btrfs_buffer_uptodate(cur, gen);
552 553
		else
			uptodate = 0;
554
		if (!cur || !uptodate) {
555
			if (cache_only) {
556
				free_extent_buffer(cur);
557 558
				continue;
			}
559 560
			if (!cur) {
				cur = read_tree_block(root, blocknr,
561
							 blocksize, gen);
562
			} else if (!uptodate) {
563
				btrfs_read_buffer(cur, gen);
564
			}
565
		}
566
		if (search_start == 0)
567
			search_start = last_block;
568

569
		btrfs_tree_lock(cur);
570
		err = __btrfs_cow_block(trans, root, cur, parent, i,
571
					&cur, search_start,
572
					min(16 * blocksize,
573
					    (end_slot - i) * blocksize), 0);
Y
Yan 已提交
574
		if (err) {
575
			btrfs_tree_unlock(cur);
576
			free_extent_buffer(cur);
577
			break;
Y
Yan 已提交
578
		}
579 580
		search_start = cur->start;
		last_block = cur->start;
581
		*last_ret = search_start;
582 583
		btrfs_tree_unlock(cur);
		free_extent_buffer(cur);
584
	}
585 586 587 588 589
	if (parent->map_token) {
		unmap_extent_buffer(parent, parent->map_token,
				    KM_USER1);
		parent->map_token = NULL;
	}
590 591 592
	return err;
}

C
Chris Mason 已提交
593 594 595 596 597
/*
 * The leaf data grows from end-to-front in the node.
 * this returns the address of the start of the last item,
 * which is the stop of the leaf data stack
 */
C
Chris Mason 已提交
598
static inline unsigned int leaf_data_end(struct btrfs_root *root,
599
					 struct extent_buffer *leaf)
600
{
601
	u32 nr = btrfs_header_nritems(leaf);
602
	if (nr == 0)
C
Chris Mason 已提交
603
		return BTRFS_LEAF_DATA_SIZE(root);
604
	return btrfs_item_offset_nr(leaf, nr - 1);
605 606
}

C
Chris Mason 已提交
607 608 609 610
/*
 * extra debugging checks to make sure all the items in a key are
 * well formed and in the proper order
 */
C
Chris Mason 已提交
611 612
static int check_node(struct btrfs_root *root, struct btrfs_path *path,
		      int level)
C
Chris Mason 已提交
613
{
614 615 616 617
	struct extent_buffer *parent = NULL;
	struct extent_buffer *node = path->nodes[level];
	struct btrfs_disk_key parent_key;
	struct btrfs_disk_key node_key;
C
Chris Mason 已提交
618
	int parent_slot;
619 620
	int slot;
	struct btrfs_key cpukey;
621
	u32 nritems = btrfs_header_nritems(node);
C
Chris Mason 已提交
622 623

	if (path->nodes[level + 1])
624
		parent = path->nodes[level + 1];
A
Aneesh 已提交
625

626
	slot = path->slots[level];
627 628
	BUG_ON(nritems == 0);
	if (parent) {
A
Aneesh 已提交
629
		parent_slot = path->slots[level + 1];
630 631 632
		btrfs_node_key(parent, &parent_key, parent_slot);
		btrfs_node_key(node, &node_key, 0);
		BUG_ON(memcmp(&parent_key, &node_key,
C
Chris Mason 已提交
633
			      sizeof(struct btrfs_disk_key)));
634
		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
635
		       btrfs_header_bytenr(node));
C
Chris Mason 已提交
636
	}
C
Chris Mason 已提交
637
	BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
638
	if (slot != 0) {
639 640 641
		btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
		btrfs_node_key(node, &node_key, slot);
		BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
642 643
	}
	if (slot < nritems - 1) {
644 645 646
		btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
		btrfs_node_key(node, &node_key, slot);
		BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
C
Chris Mason 已提交
647 648 649 650
	}
	return 0;
}

C
Chris Mason 已提交
651 652 653 654
/*
 * extra checking to make sure all the items in a leaf are
 * well formed and in the proper order
 */
C
Chris Mason 已提交
655 656
static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
		      int level)
C
Chris Mason 已提交
657
{
658 659
	struct extent_buffer *leaf = path->nodes[level];
	struct extent_buffer *parent = NULL;
C
Chris Mason 已提交
660
	int parent_slot;
661
	struct btrfs_key cpukey;
662 663 664
	struct btrfs_disk_key parent_key;
	struct btrfs_disk_key leaf_key;
	int slot = path->slots[0];
665

666
	u32 nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
667 668

	if (path->nodes[level + 1])
669
		parent = path->nodes[level + 1];
670 671 672 673 674

	if (nritems == 0)
		return 0;

	if (parent) {
A
Aneesh 已提交
675
		parent_slot = path->slots[level + 1];
676 677
		btrfs_node_key(parent, &parent_key, parent_slot);
		btrfs_item_key(leaf, &leaf_key, 0);
678

679
		BUG_ON(memcmp(&parent_key, &leaf_key,
C
Chris Mason 已提交
680
		       sizeof(struct btrfs_disk_key)));
681
		BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
682
		       btrfs_header_bytenr(leaf));
683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707
	}
#if 0
	for (i = 0; nritems > 1 && i < nritems - 2; i++) {
		btrfs_item_key_to_cpu(leaf, &cpukey, i + 1);
		btrfs_item_key(leaf, &leaf_key, i);
		if (comp_keys(&leaf_key, &cpukey) >= 0) {
			btrfs_print_leaf(root, leaf);
			printk("slot %d offset bad key\n", i);
			BUG_ON(1);
		}
		if (btrfs_item_offset_nr(leaf, i) !=
			btrfs_item_end_nr(leaf, i + 1)) {
			btrfs_print_leaf(root, leaf);
			printk("slot %d offset bad\n", i);
			BUG_ON(1);
		}
		if (i == 0) {
			if (btrfs_item_offset_nr(leaf, i) +
			       btrfs_item_size_nr(leaf, i) !=
			       BTRFS_LEAF_DATA_SIZE(root)) {
				btrfs_print_leaf(root, leaf);
				printk("slot %d first offset bad\n", i);
				BUG_ON(1);
			}
		}
C
Chris Mason 已提交
708
	}
709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730
	if (nritems > 0) {
		if (btrfs_item_size_nr(leaf, nritems - 1) > 4096) {
				btrfs_print_leaf(root, leaf);
				printk("slot %d bad size \n", nritems - 1);
				BUG_ON(1);
		}
	}
#endif
	if (slot != 0 && slot < nritems - 1) {
		btrfs_item_key(leaf, &leaf_key, slot);
		btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
		if (comp_keys(&leaf_key, &cpukey) <= 0) {
			btrfs_print_leaf(root, leaf);
			printk("slot %d offset bad key\n", slot);
			BUG_ON(1);
		}
		if (btrfs_item_offset_nr(leaf, slot - 1) !=
		       btrfs_item_end_nr(leaf, slot)) {
			btrfs_print_leaf(root, leaf);
			printk("slot %d offset bad\n", slot);
			BUG_ON(1);
		}
731 732
	}
	if (slot < nritems - 1) {
733 734 735 736 737 738 739 740 741
		btrfs_item_key(leaf, &leaf_key, slot);
		btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
		BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
		if (btrfs_item_offset_nr(leaf, slot) !=
			btrfs_item_end_nr(leaf, slot + 1)) {
			btrfs_print_leaf(root, leaf);
			printk("slot %d offset bad\n", slot);
			BUG_ON(1);
		}
C
Chris Mason 已提交
742
	}
743 744
	BUG_ON(btrfs_item_offset_nr(leaf, 0) +
	       btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
C
Chris Mason 已提交
745 746 747
	return 0;
}

748 749
static int noinline check_block(struct btrfs_root *root,
				struct btrfs_path *path, int level)
C
Chris Mason 已提交
750
{
751
	u64 found_start;
752
	return 0;
753 754 755 756 757 758 759 760 761
	if (btrfs_header_level(path->nodes[level]) != level)
	    printk("warning: bad level %Lu wanted %d found %d\n",
		   path->nodes[level]->start, level,
		   btrfs_header_level(path->nodes[level]));
	found_start = btrfs_header_bytenr(path->nodes[level]);
	if (found_start != path->nodes[level]->start) {
	    printk("warning: bad bytentr %Lu found %Lu\n",
		   path->nodes[level]->start, found_start);
	}
762
#if 0
763 764
	struct extent_buffer *buf = path->nodes[level];

765 766 767
	if (memcmp_extent_buffer(buf, root->fs_info->fsid,
				 (unsigned long)btrfs_header_fsid(buf),
				 BTRFS_FSID_SIZE)) {
768
		printk("warning bad block %Lu\n", buf->start);
769
		return 1;
770
	}
771
#endif
C
Chris Mason 已提交
772
	if (level == 0)
C
Chris Mason 已提交
773 774
		return check_leaf(root, path, level);
	return check_node(root, path, level);
C
Chris Mason 已提交
775 776
}

C
Chris Mason 已提交
777
/*
778 779 780
 * search for key in the extent_buffer.  The items start at offset p,
 * and they are item_size apart.  There are 'max' items in p.
 *
C
Chris Mason 已提交
781 782 783 784 785 786
 * the slot in the array is returned via slot, and it points to
 * the place where you would insert key if it is not found in
 * the array.
 *
 * slot may point to max if the key is bigger than all of the keys
 */
787 788 789 790
static noinline int generic_bin_search(struct extent_buffer *eb,
				       unsigned long p,
				       int item_size, struct btrfs_key *key,
				       int max, int *slot)
791 792 793 794 795
{
	int low = 0;
	int high = max;
	int mid;
	int ret;
796
	struct btrfs_disk_key *tmp = NULL;
797 798 799 800 801 802
	struct btrfs_disk_key unaligned;
	unsigned long offset;
	char *map_token = NULL;
	char *kaddr = NULL;
	unsigned long map_start = 0;
	unsigned long map_len = 0;
803
	int err;
804 805 806

	while(low < high) {
		mid = (low + high) / 2;
807 808 809 810 811
		offset = p + mid * item_size;

		if (!map_token || offset < map_start ||
		    (offset + sizeof(struct btrfs_disk_key)) >
		    map_start + map_len) {
812
			if (map_token) {
813
				unmap_extent_buffer(eb, map_token, KM_USER0);
814 815 816 817 818 819 820 821 822 823 824 825 826 827 828
				map_token = NULL;
			}
			err = map_extent_buffer(eb, offset,
						sizeof(struct btrfs_disk_key),
						&map_token, &kaddr,
						&map_start, &map_len, KM_USER0);

			if (!err) {
				tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
			} else {
				read_extent_buffer(eb, &unaligned,
						   offset, sizeof(unaligned));
				tmp = &unaligned;
			}
829 830 831 832 833

		} else {
			tmp = (struct btrfs_disk_key *)(kaddr + offset -
							map_start);
		}
834 835 836 837 838 839 840 841
		ret = comp_keys(tmp, key);

		if (ret < 0)
			low = mid + 1;
		else if (ret > 0)
			high = mid;
		else {
			*slot = mid;
842 843
			if (map_token)
				unmap_extent_buffer(eb, map_token, KM_USER0);
844 845 846 847
			return 0;
		}
	}
	*slot = low;
848 849
	if (map_token)
		unmap_extent_buffer(eb, map_token, KM_USER0);
850 851 852
	return 1;
}

C
Chris Mason 已提交
853 854 855 856
/*
 * simple bin_search frontend that does the right thing for
 * leaves vs nodes
 */
857 858
static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
		      int level, int *slot)
859
{
860 861 862
	if (level == 0) {
		return generic_bin_search(eb,
					  offsetof(struct btrfs_leaf, items),
C
Chris Mason 已提交
863
					  sizeof(struct btrfs_item),
864
					  key, btrfs_header_nritems(eb),
865
					  slot);
866
	} else {
867 868
		return generic_bin_search(eb,
					  offsetof(struct btrfs_node, ptrs),
C
Chris Mason 已提交
869
					  sizeof(struct btrfs_key_ptr),
870
					  key, btrfs_header_nritems(eb),
871
					  slot);
872 873 874 875
	}
	return -1;
}

C
Chris Mason 已提交
876 877 878 879
/* given a node and slot number, this reads the blocks it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 * NULL is returned on error.
 */
880
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
881
				   struct extent_buffer *parent, int slot)
882
{
883
	int level = btrfs_header_level(parent);
884 885
	if (slot < 0)
		return NULL;
886
	if (slot >= btrfs_header_nritems(parent))
887
		return NULL;
888 889 890

	BUG_ON(level == 0);

891
	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
892 893
		       btrfs_level_size(root, level - 1),
		       btrfs_node_ptr_generation(parent, slot));
894 895
}

C
Chris Mason 已提交
896 897 898 899 900
/*
 * node level balancing, used to make sure nodes are in proper order for
 * item deletion.  We balance from the top down, so we have to make sure
 * that a deletion won't leave an node completely empty later on.
 */
901
static noinline int balance_level(struct btrfs_trans_handle *trans,
902 903
			 struct btrfs_root *root,
			 struct btrfs_path *path, int level)
904
{
905 906 907 908
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
909 910 911 912
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
913
	int err_on_enospc = 0;
914
	u64 orig_ptr;
915 916 917 918

	if (level == 0)
		return 0;

919
	mid = path->nodes[level];
920
	WARN_ON(!path->locks[level]);
921 922
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

923
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
924

C
Chris Mason 已提交
925
	if (level < BTRFS_MAX_LEVEL - 1)
926
		parent = path->nodes[level + 1];
927 928
	pslot = path->slots[level + 1];

C
Chris Mason 已提交
929 930 931 932
	/*
	 * deal with the case where there is only one pointer in the root
	 * by promoting the node below to a root
	 */
933 934
	if (!parent) {
		struct extent_buffer *child;
935

936
		if (btrfs_header_nritems(mid) != 1)
937 938 939
			return 0;

		/* promote the child to a root */
940
		child = read_node_slot(root, mid, 0);
941
		btrfs_tree_lock(child);
942
		BUG_ON(!child);
943
		ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
944 945
		BUG_ON(ret);

946
		spin_lock(&root->node_lock);
947
		root->node = child;
948 949
		spin_unlock(&root->node_lock);

Z
Zheng Yan 已提交
950 951 952
		ret = btrfs_update_extent_ref(trans, root, child->start,
					      mid->start, child->start,
					      root->root_key.objectid,
953
					      trans->transid, level - 1);
Z
Zheng Yan 已提交
954 955
		BUG_ON(ret);

956
		add_root_to_dirty_list(root);
957 958
		btrfs_tree_unlock(child);
		path->locks[level] = 0;
959
		path->nodes[level] = NULL;
960
		clean_tree_block(trans, root, mid);
961
		btrfs_tree_unlock(mid);
962
		/* once for the path */
963
		free_extent_buffer(mid);
964
		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
Z
Zheng Yan 已提交
965
					mid->start, root->root_key.objectid,
966 967
					btrfs_header_generation(mid),
					level, 1);
968
		/* once for the root ptr */
969
		free_extent_buffer(mid);
970
		return ret;
971
	}
972
	if (btrfs_header_nritems(mid) >
C
Chris Mason 已提交
973
	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
974 975
		return 0;

976
	if (btrfs_header_nritems(mid) < 2)
977 978
		err_on_enospc = 1;

979 980
	left = read_node_slot(root, parent, pslot - 1);
	if (left) {
981
		btrfs_tree_lock(left);
982
		wret = btrfs_cow_block(trans, root, left,
983
				       parent, pslot - 1, &left, 0);
984 985 986 987
		if (wret) {
			ret = wret;
			goto enospc;
		}
988
	}
989 990
	right = read_node_slot(root, parent, pslot + 1);
	if (right) {
991
		btrfs_tree_lock(right);
992
		wret = btrfs_cow_block(trans, root, right,
993
				       parent, pslot + 1, &right, 0);
994 995 996 997 998 999 1000
		if (wret) {
			ret = wret;
			goto enospc;
		}
	}

	/* first, try to make some room in the middle buffer */
1001 1002
	if (left) {
		orig_slot += btrfs_header_nritems(left);
1003
		wret = push_node_left(trans, root, left, mid, 1);
1004 1005
		if (wret < 0)
			ret = wret;
1006
		if (btrfs_header_nritems(mid) < 2)
1007
			err_on_enospc = 1;
1008
	}
1009 1010 1011 1012

	/*
	 * then try to empty the right most buffer into the middle
	 */
1013
	if (right) {
1014
		wret = push_node_left(trans, root, mid, right, 1);
1015
		if (wret < 0 && wret != -ENOSPC)
1016
			ret = wret;
1017
		if (btrfs_header_nritems(right) == 0) {
1018
			u64 bytenr = right->start;
1019
			u64 generation = btrfs_header_generation(parent);
1020 1021
			u32 blocksize = right->len;

1022
			clean_tree_block(trans, root, right);
1023
			btrfs_tree_unlock(right);
1024
			free_extent_buffer(right);
1025
			right = NULL;
1026 1027
			wret = del_ptr(trans, root, path, level + 1, pslot +
				       1);
1028 1029
			if (wret)
				ret = wret;
1030
			wret = btrfs_free_extent(trans, root, bytenr,
Z
Zheng Yan 已提交
1031
						 blocksize, parent->start,
1032
						 btrfs_header_owner(parent),
1033
						 generation, level, 1);
1034 1035 1036
			if (wret)
				ret = wret;
		} else {
1037 1038 1039 1040
			struct btrfs_disk_key right_key;
			btrfs_node_key(right, &right_key, 0);
			btrfs_set_node_key(parent, &right_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);
1041 1042
		}
	}
1043
	if (btrfs_header_nritems(mid) == 1) {
1044 1045 1046 1047 1048 1049 1050 1051 1052
		/*
		 * we're not allowed to leave a node with one item in the
		 * tree during a delete.  A deletion from lower in the tree
		 * could try to delete the only pointer in this node.
		 * So, pull some keys from the left.
		 * There has to be a left pointer at this point because
		 * otherwise we would have pulled some pointers from the
		 * right
		 */
1053 1054
		BUG_ON(!left);
		wret = balance_node_right(trans, root, mid, left);
1055
		if (wret < 0) {
1056
			ret = wret;
1057 1058
			goto enospc;
		}
1059 1060 1061 1062 1063
		if (wret == 1) {
			wret = push_node_left(trans, root, left, mid, 1);
			if (wret < 0)
				ret = wret;
		}
1064 1065
		BUG_ON(wret == 1);
	}
1066
	if (btrfs_header_nritems(mid) == 0) {
1067
		/* we've managed to empty the middle node, drop it */
1068
		u64 root_gen = btrfs_header_generation(parent);
1069 1070
		u64 bytenr = mid->start;
		u32 blocksize = mid->len;
1071

1072
		clean_tree_block(trans, root, mid);
1073
		btrfs_tree_unlock(mid);
1074
		free_extent_buffer(mid);
1075
		mid = NULL;
1076
		wret = del_ptr(trans, root, path, level + 1, pslot);
1077 1078
		if (wret)
			ret = wret;
1079
		wret = btrfs_free_extent(trans, root, bytenr, blocksize,
Z
Zheng Yan 已提交
1080
					 parent->start,
1081
					 btrfs_header_owner(parent),
1082
					 root_gen, level, 1);
1083 1084
		if (wret)
			ret = wret;
1085 1086
	} else {
		/* update the parent key to reflect our changes */
1087 1088 1089 1090
		struct btrfs_disk_key mid_key;
		btrfs_node_key(mid, &mid_key, 0);
		btrfs_set_node_key(parent, &mid_key, pslot);
		btrfs_mark_buffer_dirty(parent);
1091
	}
1092

1093
	/* update the path */
1094 1095 1096
	if (left) {
		if (btrfs_header_nritems(left) > orig_slot) {
			extent_buffer_get(left);
1097
			/* left was locked after cow */
1098
			path->nodes[level] = left;
1099 1100
			path->slots[level + 1] -= 1;
			path->slots[level] = orig_slot;
1101 1102
			if (mid) {
				btrfs_tree_unlock(mid);
1103
				free_extent_buffer(mid);
1104
			}
1105
		} else {
1106
			orig_slot -= btrfs_header_nritems(left);
1107 1108 1109
			path->slots[level] = orig_slot;
		}
	}
1110
	/* double check we haven't messed things up */
C
Chris Mason 已提交
1111
	check_block(root, path, level);
C
Chris Mason 已提交
1112
	if (orig_ptr !=
1113
	    btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1114
		BUG();
1115
enospc:
1116 1117
	if (right) {
		btrfs_tree_unlock(right);
1118
		free_extent_buffer(right);
1119 1120 1121 1122
	}
	if (left) {
		if (path->nodes[level] != left)
			btrfs_tree_unlock(left);
1123
		free_extent_buffer(left);
1124
	}
1125 1126 1127
	return ret;
}

C
Chris Mason 已提交
1128 1129 1130 1131
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 */
1132 1133 1134
static int noinline push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
1135
{
1136 1137 1138 1139
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
1140 1141 1142 1143 1144 1145 1146 1147 1148
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];
	u64 orig_ptr;

	if (level == 0)
		return 1;

1149
	mid = path->nodes[level];
1150
	WARN_ON(btrfs_header_generation(mid) != trans->transid);
1151 1152 1153
	orig_ptr = btrfs_node_blockptr(mid, orig_slot);

	if (level < BTRFS_MAX_LEVEL - 1)
1154
		parent = path->nodes[level + 1];
1155 1156
	pslot = path->slots[level + 1];

1157
	if (!parent)
1158 1159
		return 1;

1160
	left = read_node_slot(root, parent, pslot - 1);
1161 1162

	/* first, try to make some room in the middle buffer */
1163
	if (left) {
1164
		u32 left_nr;
1165 1166

		btrfs_tree_lock(left);
1167
		left_nr = btrfs_header_nritems(left);
C
Chris Mason 已提交
1168 1169 1170
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
1171
			ret = btrfs_cow_block(trans, root, left, parent,
1172
					      pslot - 1, &left, 0);
1173 1174 1175 1176
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, root,
1177
						      left, mid, 0);
1178
			}
C
Chris Mason 已提交
1179
		}
1180 1181 1182
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
1183
			struct btrfs_disk_key disk_key;
1184
			orig_slot += left_nr;
1185 1186 1187 1188 1189
			btrfs_node_key(mid, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				path->nodes[level] = left;
1190 1191
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
1192
				btrfs_tree_unlock(mid);
1193
				free_extent_buffer(mid);
1194 1195
			} else {
				orig_slot -=
1196
					btrfs_header_nritems(left);
1197
				path->slots[level] = orig_slot;
1198
				btrfs_tree_unlock(left);
1199
				free_extent_buffer(left);
1200 1201 1202
			}
			return 0;
		}
1203
		btrfs_tree_unlock(left);
1204
		free_extent_buffer(left);
1205
	}
1206
	right = read_node_slot(root, parent, pslot + 1);
1207 1208 1209 1210

	/*
	 * then try to empty the right most buffer into the middle
	 */
1211
	if (right) {
C
Chris Mason 已提交
1212
		u32 right_nr;
1213
		btrfs_tree_lock(right);
1214
		right_nr = btrfs_header_nritems(right);
C
Chris Mason 已提交
1215 1216 1217
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
			wret = 1;
		} else {
1218 1219
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
1220
					      &right, 0);
1221 1222 1223 1224
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, root,
1225
							  right, mid);
1226
			}
C
Chris Mason 已提交
1227
		}
1228 1229 1230
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
1231 1232 1233 1234 1235 1236 1237 1238
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				path->nodes[level] = right;
1239 1240
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
1241
					btrfs_header_nritems(mid);
1242
				btrfs_tree_unlock(mid);
1243
				free_extent_buffer(mid);
1244
			} else {
1245
				btrfs_tree_unlock(right);
1246
				free_extent_buffer(right);
1247 1248 1249
			}
			return 0;
		}
1250
		btrfs_tree_unlock(right);
1251
		free_extent_buffer(right);
1252 1253 1254 1255
	}
	return 1;
}

1256
/*
C
Chris Mason 已提交
1257 1258
 * readahead one full node of leaves, finding things that are close
 * to the block in 'slot', and triggering ra on them.
1259
 */
1260 1261 1262
static noinline void reada_for_search(struct btrfs_root *root,
				      struct btrfs_path *path,
				      int level, int slot, u64 objectid)
1263
{
1264
	struct extent_buffer *node;
1265
	struct btrfs_disk_key disk_key;
1266 1267
	u32 nritems;
	u64 search;
1268 1269 1270
	u64 lowest_read;
	u64 highest_read;
	u64 nread = 0;
1271
	int direction = path->reada;
1272
	struct extent_buffer *eb;
1273 1274 1275
	u32 nr;
	u32 blocksize;
	u32 nscan = 0;
1276

1277
	if (level != 1)
1278 1279 1280
		return;

	if (!path->nodes[level])
1281 1282
		return;

1283
	node = path->nodes[level];
1284

1285
	search = btrfs_node_blockptr(node, slot);
1286 1287
	blocksize = btrfs_level_size(root, level - 1);
	eb = btrfs_find_tree_block(root, search, blocksize);
1288 1289
	if (eb) {
		free_extent_buffer(eb);
1290 1291 1292
		return;
	}

1293 1294 1295
	highest_read = search;
	lowest_read = search;

1296
	nritems = btrfs_header_nritems(node);
1297
	nr = slot;
1298
	while(1) {
1299 1300 1301 1302 1303 1304 1305 1306
		if (direction < 0) {
			if (nr == 0)
				break;
			nr--;
		} else if (direction > 0) {
			nr++;
			if (nr >= nritems)
				break;
1307
		}
1308 1309 1310 1311 1312
		if (path->reada < 0 && objectid) {
			btrfs_node_key(node, &disk_key, nr);
			if (btrfs_disk_key_objectid(&disk_key) != objectid)
				break;
		}
1313 1314
		search = btrfs_node_blockptr(node, nr);
		if ((search >= lowest_read && search <= highest_read) ||
1315 1316
		    (search < lowest_read && lowest_read - search <= 16384) ||
		    (search > highest_read && search - highest_read <= 16384)) {
1317 1318
			readahead_tree_block(root, search, blocksize,
				     btrfs_node_ptr_generation(node, nr));
1319 1320 1321
			nread += blocksize;
		}
		nscan++;
1322
		if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32))
1323
			break;
1324
		if(nread > (256 * 1024) || nscan > 128)
1325 1326 1327 1328 1329 1330
			break;

		if (search < lowest_read)
			lowest_read = search;
		if (search > highest_read)
			highest_read = search;
1331 1332
	}
}
1333

C
Chris Mason 已提交
1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346
/*
 * when we walk down the tree, it is usually safe to unlock the higher layers in
 * the tree.  The exceptions are when our path goes through slot 0, because operations
 * on the tree might require changing key pointers higher up in the tree.
 *
 * callers might also have set path->keep_locks, which tells this code to
 * keep the lock if the path points to the last slot in the block.  This is
 * part of walking through the tree, and selecting the next slot in the higher
 * block.
 *
 * lowest_unlock sets the lowest level in the tree we're allowed to unlock.
 * so if lowest_unlock is 1, level 0 won't be unlocked
 */
1347 1348
static noinline void unlock_up(struct btrfs_path *path, int level,
			       int lowest_unlock)
1349 1350 1351
{
	int i;
	int skip_level = level;
1352
	int no_skips = 0;
1353 1354 1355 1356 1357 1358 1359
	struct extent_buffer *t;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			break;
		if (!path->locks[i])
			break;
1360
		if (!no_skips && path->slots[i] == 0) {
1361 1362 1363
			skip_level = i + 1;
			continue;
		}
1364
		if (!no_skips && path->keep_locks) {
1365 1366 1367
			u32 nritems;
			t = path->nodes[i];
			nritems = btrfs_header_nritems(t);
1368
			if (nritems < 1 || path->slots[i] >= nritems - 1) {
1369 1370 1371 1372
				skip_level = i + 1;
				continue;
			}
		}
1373 1374 1375
		if (skip_level < i && i >= lowest_unlock)
			no_skips = 1;

1376 1377 1378 1379 1380 1381 1382 1383
		t = path->nodes[i];
		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
			btrfs_tree_unlock(t);
			path->locks[i] = 0;
		}
	}
}

C
Chris Mason 已提交
1384 1385 1386 1387 1388 1389
/*
 * look for key in the tree.  path is filled in with nodes along the way
 * if key is found, we return zero and you can find the item in the leaf
 * level of the path (level 0)
 *
 * If the key isn't found, the path points to the slot where it should
C
Chris Mason 已提交
1390 1391
 * be inserted, and 1 is returned.  If there are other errors during the
 * search a negative error number is returned.
C
Chris Mason 已提交
1392 1393 1394 1395
 *
 * if ins_len > 0, nodes and leaves will be split as we walk down the
 * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
 * possible)
C
Chris Mason 已提交
1396
 */
1397 1398 1399
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
1400
{
1401
	struct extent_buffer *b;
1402
	struct extent_buffer *tmp;
1403 1404 1405
	int slot;
	int ret;
	int level;
1406
	int should_reada = p->reada;
1407
	int lowest_unlock = 1;
1408
	int blocksize;
1409
	u8 lowest_level = 0;
1410 1411
	u64 blocknr;
	u64 gen;
1412
	struct btrfs_key prealloc_block;
1413

1414
	lowest_level = p->lowest_level;
1415
	WARN_ON(lowest_level && ins_len > 0);
C
Chris Mason 已提交
1416
	WARN_ON(p->nodes[0] != NULL);
1417

1418 1419
	if (ins_len < 0)
		lowest_unlock = 2;
1420 1421 1422

	prealloc_block.objectid = 0;

1423
again:
1424 1425 1426 1427
	if (p->skip_locking)
		b = btrfs_root_node(root);
	else
		b = btrfs_lock_root_node(root);
1428

1429
	while (b) {
1430
		level = btrfs_header_level(b);
1431 1432 1433 1434 1435 1436 1437 1438 1439

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		p->nodes[level] = b;
		if (!p->skip_locking)
			p->locks[level] = 1;

C
Chris Mason 已提交
1440 1441
		if (cow) {
			int wret;
1442 1443 1444 1445

			/* is a cow on this block not required */
			spin_lock(&root->fs_info->hash_lock);
			if (btrfs_header_generation(b) == trans->transid &&
1446
			    btrfs_header_owner(b) == root->root_key.objectid &&
1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481
			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
				spin_unlock(&root->fs_info->hash_lock);
				goto cow_done;
			}
			spin_unlock(&root->fs_info->hash_lock);

			/* ok, we have to cow, is our old prealloc the right
			 * size?
			 */
			if (prealloc_block.objectid &&
			    prealloc_block.offset != b->len) {
				btrfs_free_reserved_extent(root,
					   prealloc_block.objectid,
					   prealloc_block.offset);
				prealloc_block.objectid = 0;
			}

			/*
			 * for higher level blocks, try not to allocate blocks
			 * with the block and the parent locks held.
			 */
			if (level > 1 && !prealloc_block.objectid &&
			    btrfs_path_lock_waiting(p, level)) {
				u32 size = b->len;
				u64 hint = b->start;

				btrfs_release_path(root, p);
				ret = btrfs_reserve_extent(trans, root,
							   size, size, 0,
							   hint, (u64)-1,
							   &prealloc_block, 0);
				BUG_ON(ret);
				goto again;
			}

C
Chris Mason 已提交
1482 1483 1484
			wret = btrfs_cow_block(trans, root, b,
					       p->nodes[level + 1],
					       p->slots[level + 1],
1485 1486
					       &b, prealloc_block.objectid);
			prealloc_block.objectid = 0;
1487
			if (wret) {
1488
				free_extent_buffer(b);
1489 1490
				ret = wret;
				goto done;
1491
			}
C
Chris Mason 已提交
1492
		}
1493
cow_done:
C
Chris Mason 已提交
1494
		BUG_ON(!cow && ins_len);
1495
		if (level != btrfs_header_level(b))
C
Chris Mason 已提交
1496
			WARN_ON(1);
1497
		level = btrfs_header_level(b);
1498

1499
		p->nodes[level] = b;
1500 1501
		if (!p->skip_locking)
			p->locks[level] = 1;
1502

C
Chris Mason 已提交
1503
		ret = check_block(root, p, level);
1504 1505 1506 1507
		if (ret) {
			ret = -1;
			goto done;
		}
1508

1509 1510
		ret = bin_search(b, key, level, &slot);
		if (level != 0) {
1511 1512 1513
			if (ret && slot > 0)
				slot -= 1;
			p->slots[level] = slot;
1514
			if (ins_len > 0 && btrfs_header_nritems(b) >=
1515
			    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
1516
				int sret = split_node(trans, root, p, level);
C
Chris Mason 已提交
1517
				BUG_ON(sret > 0);
1518 1519 1520 1521
				if (sret) {
					ret = sret;
					goto done;
				}
C
Chris Mason 已提交
1522 1523
				b = p->nodes[level];
				slot = p->slots[level];
1524
			} else if (ins_len < 0) {
1525 1526
				int sret = balance_level(trans, root, p,
							 level);
1527 1528 1529 1530
				if (sret) {
					ret = sret;
					goto done;
				}
1531
				b = p->nodes[level];
1532 1533
				if (!b) {
					btrfs_release_path(NULL, p);
1534
					goto again;
1535
				}
1536
				slot = p->slots[level];
1537
				BUG_ON(btrfs_header_nritems(b) == 1);
C
Chris Mason 已提交
1538
			}
1539 1540
			unlock_up(p, level, lowest_unlock);

1541
			/* this is only true while dropping a snapshot */
1542
			if (level == lowest_level) {
1543 1544
				ret = 0;
				goto done;
1545
			}
1546

1547 1548 1549 1550 1551 1552
			blocknr = btrfs_node_blockptr(b, slot);
			gen = btrfs_node_ptr_generation(b, slot);
			blocksize = btrfs_level_size(root, level - 1);

			tmp = btrfs_find_tree_block(root, blocknr, blocksize);
			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1553 1554 1555 1556 1557 1558 1559 1560 1561
				b = tmp;
			} else {
				/*
				 * reduce lock contention at high levels
				 * of the btree by dropping locks before
				 * we read.
				 */
				if (level > 1) {
					btrfs_release_path(NULL, p);
1562 1563
					if (tmp)
						free_extent_buffer(tmp);
1564 1565 1566 1567 1568
					if (should_reada)
						reada_for_search(root, p,
								 level, slot,
								 key->objectid);

1569 1570
					tmp = read_tree_block(root, blocknr,
							 blocksize, gen);
1571 1572 1573 1574
					if (tmp)
						free_extent_buffer(tmp);
					goto again;
				} else {
1575 1576
					if (tmp)
						free_extent_buffer(tmp);
1577 1578 1579 1580
					if (should_reada)
						reada_for_search(root, p,
								 level, slot,
								 key->objectid);
1581 1582 1583
					b = read_node_slot(root, b, slot);
				}
			}
1584 1585
			if (!p->skip_locking)
				btrfs_tree_lock(b);
1586 1587
		} else {
			p->slots[level] = slot;
1588
			if (ins_len > 0 && btrfs_leaf_free_space(root, b) <
C
Chris Mason 已提交
1589
			    sizeof(struct btrfs_item) + ins_len) {
1590
				int sret = split_leaf(trans, root, key,
1591
						      p, ins_len, ret == 0);
C
Chris Mason 已提交
1592
				BUG_ON(sret > 0);
1593 1594 1595 1596
				if (sret) {
					ret = sret;
					goto done;
				}
C
Chris Mason 已提交
1597
			}
1598
			unlock_up(p, level, lowest_unlock);
1599
			goto done;
1600 1601
		}
	}
1602 1603 1604 1605 1606 1607 1608 1609 1610
	ret = 1;
done:
	if (prealloc_block.objectid) {
		btrfs_free_reserved_extent(root,
			   prealloc_block.objectid,
			   prealloc_block.offset);
	}

	return ret;
1611 1612
}

Z
Zheng Yan 已提交
1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652
int btrfs_merge_path(struct btrfs_trans_handle *trans,
		     struct btrfs_root *root,
		     struct btrfs_key *node_keys,
		     u64 *nodes, int lowest_level)
{
	struct extent_buffer *eb;
	struct extent_buffer *parent;
	struct btrfs_key key;
	u64 bytenr;
	u64 generation;
	u32 blocksize;
	int level;
	int slot;
	int key_match;
	int ret;

	eb = btrfs_lock_root_node(root);
	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
	BUG_ON(ret);

	parent = eb;
	while (1) {
		level = btrfs_header_level(parent);
		if (level == 0 || level <= lowest_level)
			break;

		ret = bin_search(parent, &node_keys[lowest_level], level,
				 &slot);
		if (ret && slot > 0)
			slot--;

		bytenr = btrfs_node_blockptr(parent, slot);
		if (nodes[level - 1] == bytenr)
			break;

		blocksize = btrfs_level_size(root, level - 1);
		generation = btrfs_node_ptr_generation(parent, slot);
		btrfs_node_key_to_cpu(eb, &key, slot);
		key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));

Y
Yan Zheng 已提交
1653 1654 1655 1656 1657 1658
		if (generation == trans->transid) {
			eb = read_tree_block(root, bytenr, blocksize,
					     generation);
			btrfs_tree_lock(eb);
		}

Z
Zheng Yan 已提交
1659 1660 1661
		/*
		 * if node keys match and node pointer hasn't been modified
		 * in the running transaction, we can merge the path. for
Y
Yan Zheng 已提交
1662 1663 1664
		 * blocks owened by reloc trees, the node pointer check is
		 * skipped, this is because these blocks are fully controlled
		 * by the space balance code, no one else can modify them.
Z
Zheng Yan 已提交
1665 1666 1667
		 */
		if (!nodes[level - 1] || !key_match ||
		    (generation == trans->transid &&
Y
Yan Zheng 已提交
1668 1669 1670 1671 1672 1673
		     btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
			if (level == 1 || level == lowest_level + 1) {
				if (generation == trans->transid) {
					btrfs_tree_unlock(eb);
					free_extent_buffer(eb);
				}
Z
Zheng Yan 已提交
1674
				break;
Y
Yan Zheng 已提交
1675
			}
Z
Zheng Yan 已提交
1676

Y
Yan Zheng 已提交
1677 1678 1679 1680 1681
			if (generation != trans->transid) {
				eb = read_tree_block(root, bytenr, blocksize,
						generation);
				btrfs_tree_lock(eb);
			}
Z
Zheng Yan 已提交
1682 1683 1684 1685 1686

			ret = btrfs_cow_block(trans, root, eb, parent, slot,
					      &eb, 0);
			BUG_ON(ret);

Y
Yan Zheng 已提交
1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				if (!nodes[level - 1]) {
					nodes[level - 1] = eb->start;
					memcpy(&node_keys[level - 1], &key,
					       sizeof(node_keys[0]));
				} else {
					WARN_ON(1);
				}
			}

Z
Zheng Yan 已提交
1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712
			btrfs_tree_unlock(parent);
			free_extent_buffer(parent);
			parent = eb;
			continue;
		}

		btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
		btrfs_set_node_ptr_generation(parent, slot, trans->transid);
		btrfs_mark_buffer_dirty(parent);

		ret = btrfs_inc_extent_ref(trans, root,
					nodes[level - 1],
					blocksize, parent->start,
					btrfs_header_owner(parent),
					btrfs_header_generation(parent),
1713
					level - 1);
Z
Zheng Yan 已提交
1714 1715
		BUG_ON(ret);

Y
Yan Zheng 已提交
1716 1717 1718 1719 1720
		/*
		 * If the block was created in the running transaction,
		 * it's possible this is the last reference to it, so we
		 * should drop the subtree.
		 */
Z
Zheng Yan 已提交
1721
		if (generation == trans->transid) {
Y
Yan Zheng 已提交
1722 1723
			ret = btrfs_drop_subtree(trans, root, eb, parent);
			BUG_ON(ret);
Z
Zheng Yan 已提交
1724 1725
			btrfs_tree_unlock(eb);
			free_extent_buffer(eb);
Y
Yan Zheng 已提交
1726 1727 1728 1729 1730 1731 1732
		} else {
			ret = btrfs_free_extent(trans, root, bytenr,
					blocksize, parent->start,
					btrfs_header_owner(parent),
					btrfs_header_generation(parent),
					level - 1, 1);
			BUG_ON(ret);
Z
Zheng Yan 已提交
1733 1734 1735 1736 1737 1738 1739 1740
		}
		break;
	}
	btrfs_tree_unlock(parent);
	free_extent_buffer(parent);
	return 0;
}

C
Chris Mason 已提交
1741 1742 1743 1744 1745 1746
/*
 * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node is points to 'key'.
 * This is used after shifting pointers to the left, so it stops
 * fixing up pointers when a given leaf/node is not in slot 0 of the
 * higher levels
C
Chris Mason 已提交
1747 1748 1749
 *
 * If this fails to write a tree block, it returns -1, but continues
 * fixing up the blocks in ram so the tree is consistent.
C
Chris Mason 已提交
1750
 */
1751 1752 1753
static int fixup_low_keys(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct btrfs_path *path,
			  struct btrfs_disk_key *key, int level)
1754 1755
{
	int i;
C
Chris Mason 已提交
1756
	int ret = 0;
1757 1758
	struct extent_buffer *t;

C
Chris Mason 已提交
1759
	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1760
		int tslot = path->slots[i];
1761
		if (!path->nodes[i])
1762
			break;
1763 1764
		t = path->nodes[i];
		btrfs_set_node_key(t, key, tslot);
C
Chris Mason 已提交
1765
		btrfs_mark_buffer_dirty(path->nodes[i]);
1766 1767 1768
		if (tslot != 0)
			break;
	}
C
Chris Mason 已提交
1769
	return ret;
1770 1771
}

Z
Zheng Yan 已提交
1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806
/*
 * update item key.
 *
 * This function isn't completely safe. It's the caller's responsibility
 * that the new key won't break the order
 */
int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root, struct btrfs_path *path,
			    struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
		if (comp_keys(&disk_key, new_key) >= 0)
			return -1;
	}
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
		if (comp_keys(&disk_key, new_key) <= 0)
			return -1;
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	if (slot == 0)
		fixup_low_keys(trans, root, path, &disk_key, 1);
	return 0;
}

C
Chris Mason 已提交
1807 1808
/*
 * try to push data from one node into the next node left in the
1809
 * tree.
C
Chris Mason 已提交
1810 1811 1812
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
C
Chris Mason 已提交
1813
 */
1814 1815
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
1816
			  struct extent_buffer *src, int empty)
1817 1818
{
	int push_items = 0;
1819 1820
	int src_nritems;
	int dst_nritems;
C
Chris Mason 已提交
1821
	int ret = 0;
1822

1823 1824
	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
C
Chris Mason 已提交
1825
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1826 1827
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);
1828

1829
	if (!empty && src_nritems <= 8)
1830 1831
		return 1;

1832
	if (push_items <= 0) {
1833
		return 1;
1834
	}
1835

1836
	if (empty) {
1837
		push_items = min(src_nritems, push_items);
1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);
1850

1851 1852 1853 1854 1855
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
		           push_items * sizeof(struct btrfs_key_ptr));

1856
	if (push_items < src_nritems) {
1857 1858 1859 1860 1861 1862 1863 1864 1865
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);
Z
Zheng Yan 已提交
1866 1867 1868 1869

	ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
	BUG_ON(ret);

1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881
	return ret;
}

/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
1882 1883 1884 1885
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
1886 1887 1888 1889 1890 1891 1892
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

1893 1894 1895
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

1896 1897
	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
C
Chris Mason 已提交
1898
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
1899
	if (push_items <= 0) {
1900
		return 1;
1901 1902 1903 1904 1905
	}

	if (src_nritems < 4) {
		return 1;
	}
1906 1907 1908

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
1909
	if (max_push >= src_nritems) {
1910
		return 1;
1911
	}
Y
Yan 已提交
1912

1913 1914 1915
	if (max_push < push_items)
		push_items = max_push;

1916 1917 1918 1919
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));
C
Chris Mason 已提交
1920

1921 1922 1923 1924
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
		           push_items * sizeof(struct btrfs_key_ptr));
1925

1926 1927
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
1928

1929 1930
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);
Z
Zheng Yan 已提交
1931 1932 1933 1934

	ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
	BUG_ON(ret);

C
Chris Mason 已提交
1935
	return ret;
1936 1937
}

C
Chris Mason 已提交
1938 1939 1940 1941
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
C
Chris Mason 已提交
1942 1943
 *
 * returns zero on success or < 0 on failure.
C
Chris Mason 已提交
1944
 */
1945
static int noinline insert_new_root(struct btrfs_trans_handle *trans,
1946 1947
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
C
Chris Mason 已提交
1948
{
1949
	u64 lower_gen;
1950 1951
	struct extent_buffer *lower;
	struct extent_buffer *c;
1952
	struct extent_buffer *old;
1953
	struct btrfs_disk_key lower_key;
Z
Zheng Yan 已提交
1954
	int ret;
C
Chris Mason 已提交
1955 1956 1957 1958

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

1959 1960 1961 1962 1963 1964
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

Z
Zheng Yan 已提交
1965 1966
	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
				   root->root_key.objectid, trans->transid,
1967
				   level, root->node->start, 0);
1968 1969
	if (IS_ERR(c))
		return PTR_ERR(c);
1970

1971 1972 1973
	memset_extent_buffer(c, 0, 0, root->nodesize);
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
1974
	btrfs_set_header_bytenr(c, c->start);
1975 1976 1977 1978 1979 1980
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer(c, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(c),
			    BTRFS_FSID_SIZE);
1981 1982 1983 1984 1985

	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
			    BTRFS_UUID_SIZE);

1986
	btrfs_set_node_key(c, &lower_key, 0);
1987
	btrfs_set_node_blockptr(c, 0, lower->start);
1988
	lower_gen = btrfs_header_generation(lower);
Z
Zheng Yan 已提交
1989
	WARN_ON(lower_gen != trans->transid);
1990 1991

	btrfs_set_node_ptr_generation(c, 0, lower_gen);
1992

1993
	btrfs_mark_buffer_dirty(c);
1994

1995 1996
	spin_lock(&root->node_lock);
	old = root->node;
1997
	root->node = c;
1998 1999
	spin_unlock(&root->node_lock);

Z
Zheng Yan 已提交
2000 2001 2002
	ret = btrfs_update_extent_ref(trans, root, lower->start,
				      lower->start, c->start,
				      root->root_key.objectid,
2003
				      trans->transid, level - 1);
Z
Zheng Yan 已提交
2004 2005
	BUG_ON(ret);

2006 2007 2008
	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

2009
	add_root_to_dirty_list(root);
2010 2011
	extent_buffer_get(c);
	path->nodes[level] = c;
2012
	path->locks[level] = 1;
C
Chris Mason 已提交
2013 2014 2015 2016
	path->slots[level] = 0;
	return 0;
}

C
Chris Mason 已提交
2017 2018 2019
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
C
Chris Mason 已提交
2020
 *
C
Chris Mason 已提交
2021 2022
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
C
Chris Mason 已提交
2023 2024
 *
 * returns zero on success and < 0 on any error
C
Chris Mason 已提交
2025
 */
2026 2027
static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, struct btrfs_disk_key
2028
		      *key, u64 bytenr, int slot, int level)
C
Chris Mason 已提交
2029
{
2030
	struct extent_buffer *lower;
C
Chris Mason 已提交
2031
	int nritems;
C
Chris Mason 已提交
2032 2033

	BUG_ON(!path->nodes[level]);
2034 2035
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
C
Chris Mason 已提交
2036 2037
	if (slot > nritems)
		BUG();
C
Chris Mason 已提交
2038
	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
C
Chris Mason 已提交
2039 2040
		BUG();
	if (slot != nritems) {
2041 2042 2043
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
C
Chris Mason 已提交
2044
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
C
Chris Mason 已提交
2045
	}
2046
	btrfs_set_node_key(lower, key, slot);
2047
	btrfs_set_node_blockptr(lower, slot, bytenr);
2048 2049
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2050 2051
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
C
Chris Mason 已提交
2052 2053 2054
	return 0;
}

C
Chris Mason 已提交
2055 2056 2057 2058 2059 2060
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
C
Chris Mason 已提交
2061 2062
 *
 * returns 0 on success and < 0 on failure
C
Chris Mason 已提交
2063
 */
2064 2065 2066
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
2067
{
2068 2069 2070
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
2071
	int mid;
C
Chris Mason 已提交
2072
	int ret;
C
Chris Mason 已提交
2073
	int wret;
2074
	u32 c_nritems;
2075

2076
	c = path->nodes[level];
2077
	WARN_ON(btrfs_header_generation(c) != trans->transid);
2078
	if (c == root->node) {
C
Chris Mason 已提交
2079
		/* trying to split the root, lets make a new one */
2080
		ret = insert_new_root(trans, root, path, level + 1);
C
Chris Mason 已提交
2081 2082
		if (ret)
			return ret;
2083 2084
	} else {
		ret = push_nodes_for_insert(trans, root, path, level);
2085 2086
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
2087
		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
2088
			return 0;
2089 2090
		if (ret < 0)
			return ret;
2091
	}
2092

2093
	c_nritems = btrfs_header_nritems(c);
2094

2095
	split = btrfs_alloc_free_block(trans, root, root->nodesize,
Z
Zheng Yan 已提交
2096 2097 2098
					path->nodes[level + 1]->start,
					root->root_key.objectid,
					trans->transid, level, c->start, 0);
2099 2100 2101 2102 2103
	if (IS_ERR(split))
		return PTR_ERR(split);

	btrfs_set_header_flags(split, btrfs_header_flags(c));
	btrfs_set_header_level(split, btrfs_header_level(c));
2104
	btrfs_set_header_bytenr(split, split->start);
2105 2106
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_owner(split, root->root_key.objectid);
2107
	btrfs_set_header_flags(split, 0);
2108 2109 2110
	write_extent_buffer(split, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(split),
			    BTRFS_FSID_SIZE);
2111 2112 2113
	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
			    BTRFS_UUID_SIZE);
2114

2115
	mid = (c_nritems + 1) / 2;
2116 2117 2118 2119 2120 2121 2122

	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
C
Chris Mason 已提交
2123 2124
	ret = 0;

2125 2126 2127 2128
	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	btrfs_node_key(split, &disk_key, 0);
2129
	wret = insert_ptr(trans, root, path, &disk_key, split->start,
2130
			  path->slots[level + 1] + 1,
C
Chris Mason 已提交
2131
			  level + 1);
C
Chris Mason 已提交
2132 2133 2134
	if (wret)
		ret = wret;

Z
Zheng Yan 已提交
2135 2136 2137
	ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
	BUG_ON(ret);

C
Chris Mason 已提交
2138
	if (path->slots[level] >= mid) {
C
Chris Mason 已提交
2139
		path->slots[level] -= mid;
2140
		btrfs_tree_unlock(c);
2141 2142
		free_extent_buffer(c);
		path->nodes[level] = split;
C
Chris Mason 已提交
2143 2144
		path->slots[level + 1] += 1;
	} else {
2145
		btrfs_tree_unlock(split);
2146
		free_extent_buffer(split);
2147
	}
C
Chris Mason 已提交
2148
	return ret;
2149 2150
}

C
Chris Mason 已提交
2151 2152 2153 2154 2155
/*
 * how many bytes are required to store the items in a leaf.  start
 * and nr indicate which items in the leaf to check.  This totals up the
 * space used both by the item structs and the item data
 */
2156
static int leaf_space_used(struct extent_buffer *l, int start, int nr)
2157 2158
{
	int data_len;
2159
	int nritems = btrfs_header_nritems(l);
2160
	int end = min(nritems, start + nr) - 1;
2161 2162 2163

	if (!nr)
		return 0;
2164 2165
	data_len = btrfs_item_end_nr(l, start);
	data_len = data_len - btrfs_item_offset_nr(l, end);
C
Chris Mason 已提交
2166
	data_len += sizeof(struct btrfs_item) * nr;
2167
	WARN_ON(data_len < 0);
2168 2169 2170
	return data_len;
}

2171 2172 2173 2174 2175
/*
 * The space between the end of the leaf items and
 * the start of the leaf data.  IOW, how much room
 * the leaf has left for both items and data
 */
2176 2177
int noinline btrfs_leaf_free_space(struct btrfs_root *root,
				   struct extent_buffer *leaf)
2178
{
2179 2180 2181 2182 2183
	int nritems = btrfs_header_nritems(leaf);
	int ret;
	ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
	if (ret < 0) {
		printk("leaf free space ret %d, leaf data size %lu, used %d nritems %d\n",
J
Jens Axboe 已提交
2184
		       ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
2185 2186 2187
		       leaf_space_used(leaf, 0, nritems), nritems);
	}
	return ret;
2188 2189
}

C
Chris Mason 已提交
2190 2191 2192
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
C
Chris Mason 已提交
2193 2194 2195
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
C
Chris Mason 已提交
2196
 */
2197
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2198 2199
			   *root, struct btrfs_path *path, int data_size,
			   int empty)
C
Chris Mason 已提交
2200
{
2201 2202 2203 2204
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	struct btrfs_disk_key disk_key;
C
Chris Mason 已提交
2205
	int slot;
2206
	u32 i;
C
Chris Mason 已提交
2207 2208 2209
	int free_space;
	int push_space = 0;
	int push_items = 0;
C
Chris Mason 已提交
2210
	struct btrfs_item *item;
2211
	u32 left_nritems;
2212
	u32 nr;
2213
	u32 right_nritems;
2214
	u32 data_end;
2215
	u32 this_item_size;
2216
	int ret;
C
Chris Mason 已提交
2217 2218 2219 2220 2221 2222

	slot = path->slots[1];
	if (!path->nodes[1]) {
		return 1;
	}
	upper = path->nodes[1];
2223
	if (slot >= btrfs_header_nritems(upper) - 1)
C
Chris Mason 已提交
2224
		return 1;
2225

2226 2227
	WARN_ON(!btrfs_tree_locked(path->nodes[1]));

2228
	right = read_node_slot(root, upper, slot + 1);
2229
	btrfs_tree_lock(right);
C
Chris Mason 已提交
2230
	free_space = btrfs_leaf_free_space(root, right);
2231 2232
	if (free_space < data_size + sizeof(struct btrfs_item))
		goto out_unlock;
2233

C
Chris Mason 已提交
2234
	/* cow and double check */
2235
	ret = btrfs_cow_block(trans, root, right, upper,
2236
			      slot + 1, &right, 0);
2237 2238 2239
	if (ret)
		goto out_unlock;

C
Chris Mason 已提交
2240
	free_space = btrfs_leaf_free_space(root, right);
2241 2242
	if (free_space < data_size + sizeof(struct btrfs_item))
		goto out_unlock;
C
Chris Mason 已提交
2243

2244
	left_nritems = btrfs_header_nritems(left);
2245 2246
	if (left_nritems == 0)
		goto out_unlock;
2247

2248 2249 2250 2251 2252
	if (empty)
		nr = 0;
	else
		nr = 1;

Z
Zheng Yan 已提交
2253 2254 2255
	if (path->slots[0] >= left_nritems)
		push_space += data_size + sizeof(*item);

2256 2257
	i = left_nritems - 1;
	while (i >= nr) {
2258
		item = btrfs_item_nr(left, i);
2259

Z
Zheng Yan 已提交
2260 2261 2262 2263 2264 2265 2266 2267 2268 2269
		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

C
Chris Mason 已提交
2270 2271
		if (path->slots[0] == i)
			push_space += data_size + sizeof(*item);
2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282

		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
C
Chris Mason 已提交
2283
			break;
Z
Zheng Yan 已提交
2284

C
Chris Mason 已提交
2285
		push_items++;
2286
		push_space += this_item_size + sizeof(*item);
2287 2288 2289
		if (i == 0)
			break;
		i--;
2290 2291 2292 2293
	}
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
C
Chris Mason 已提交
2294
	}
2295

2296 2297
	if (push_items == 0)
		goto out_unlock;
2298

2299
	if (!empty && push_items == left_nritems)
2300
		WARN_ON(1);
2301

C
Chris Mason 已提交
2302
	/* push left to right */
2303
	right_nritems = btrfs_header_nritems(right);
2304

2305
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
C
Chris Mason 已提交
2306
	push_space -= leaf_data_end(root, left);
2307

C
Chris Mason 已提交
2308
	/* make room in the right data area */
2309 2310 2311 2312 2313 2314
	data_end = leaf_data_end(root, right);
	memmove_extent_buffer(right,
			      btrfs_leaf_data(right) + data_end - push_space,
			      btrfs_leaf_data(right) + data_end,
			      BTRFS_LEAF_DATA_SIZE(root) - data_end);

C
Chris Mason 已提交
2315
	/* copy from the left data area */
2316
	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
C
Chris Mason 已提交
2317 2318 2319
		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
		     btrfs_leaf_data(left) + leaf_data_end(root, left),
		     push_space);
2320 2321 2322 2323 2324

	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

C
Chris Mason 已提交
2325
	/* copy the items from left to right */
2326 2327 2328
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));
C
Chris Mason 已提交
2329 2330

	/* update the item pointers */
2331
	right_nritems += push_items;
2332
	btrfs_set_header_nritems(right, right_nritems);
C
Chris Mason 已提交
2333
	push_space = BTRFS_LEAF_DATA_SIZE(root);
2334
	for (i = 0; i < right_nritems; i++) {
2335
		item = btrfs_item_nr(right, i);
2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}
		push_space -= btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
C
Chris Mason 已提交
2350
	}
2351
	left_nritems -= push_items;
2352
	btrfs_set_header_nritems(left, left_nritems);
C
Chris Mason 已提交
2353

2354 2355
	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
2356
	btrfs_mark_buffer_dirty(right);
2357

Z
Zheng Yan 已提交
2358 2359 2360
	ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
	BUG_ON(ret);

2361 2362
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
C
Chris Mason 已提交
2363
	btrfs_mark_buffer_dirty(upper);
C
Chris Mason 已提交
2364

C
Chris Mason 已提交
2365
	/* then fixup the leaf pointer in the path */
2366 2367
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
2368 2369 2370
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
2371 2372
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
C
Chris Mason 已提交
2373 2374
		path->slots[1] += 1;
	} else {
2375
		btrfs_tree_unlock(right);
2376
		free_extent_buffer(right);
C
Chris Mason 已提交
2377 2378
	}
	return 0;
2379 2380 2381 2382 2383

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
C
Chris Mason 已提交
2384
}
2385

C
Chris Mason 已提交
2386 2387 2388 2389
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 */
2390
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2391 2392
			  *root, struct btrfs_path *path, int data_size,
			  int empty)
2393
{
2394 2395 2396
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
2397 2398 2399 2400 2401
	int slot;
	int i;
	int free_space;
	int push_space = 0;
	int push_items = 0;
C
Chris Mason 已提交
2402
	struct btrfs_item *item;
2403
	u32 old_left_nritems;
2404
	u32 right_nritems;
2405
	u32 nr;
C
Chris Mason 已提交
2406 2407
	int ret = 0;
	int wret;
2408 2409
	u32 this_item_size;
	u32 old_left_item_size;
2410 2411

	slot = path->slots[1];
2412
	if (slot == 0)
2413
		return 1;
2414
	if (!path->nodes[1])
2415
		return 1;
2416

2417 2418 2419 2420 2421
	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0) {
		return 1;
	}

2422 2423
	WARN_ON(!btrfs_tree_locked(path->nodes[1]));

2424
	left = read_node_slot(root, path->nodes[1], slot - 1);
2425
	btrfs_tree_lock(left);
C
Chris Mason 已提交
2426
	free_space = btrfs_leaf_free_space(root, left);
C
Chris Mason 已提交
2427
	if (free_space < data_size + sizeof(struct btrfs_item)) {
2428 2429
		ret = 1;
		goto out;
2430
	}
C
Chris Mason 已提交
2431 2432

	/* cow and double check */
2433
	ret = btrfs_cow_block(trans, root, left,
2434
			      path->nodes[1], slot - 1, &left, 0);
2435 2436
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
2437 2438
		ret = 1;
		goto out;
2439
	}
2440

C
Chris Mason 已提交
2441
	free_space = btrfs_leaf_free_space(root, left);
C
Chris Mason 已提交
2442
	if (free_space < data_size + sizeof(struct btrfs_item)) {
2443 2444
		ret = 1;
		goto out;
C
Chris Mason 已提交
2445 2446
	}

2447 2448 2449 2450 2451 2452
	if (empty)
		nr = right_nritems;
	else
		nr = right_nritems - 1;

	for (i = 0; i < nr; i++) {
2453
		item = btrfs_item_nr(right, i);
2454 2455 2456 2457 2458 2459 2460 2461
		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

Z
Zheng Yan 已提交
2462 2463 2464 2465 2466 2467 2468 2469 2470 2471
		if (!empty && push_items > 0) {
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, right);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

2472 2473
		if (path->slots[0] == i)
			push_space += data_size + sizeof(*item);
2474 2475 2476

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
2477
			break;
2478

2479
		push_items++;
2480 2481 2482 2483 2484 2485
		push_space += this_item_size + sizeof(*item);
	}

	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
2486
	}
2487

2488
	if (push_items == 0) {
2489 2490
		ret = 1;
		goto out;
2491
	}
2492
	if (!empty && push_items == btrfs_header_nritems(right))
2493
		WARN_ON(1);
2494

2495
	/* push data from right to left */
2496 2497 2498 2499 2500
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

C
Chris Mason 已提交
2501
	push_space = BTRFS_LEAF_DATA_SIZE(root) -
2502 2503 2504
		     btrfs_item_offset_nr(right, push_items -1);

	copy_extent_buffer(left, right, btrfs_leaf_data(left) +
C
Chris Mason 已提交
2505 2506
		     leaf_data_end(root, left) - push_space,
		     btrfs_leaf_data(right) +
2507
		     btrfs_item_offset_nr(right, push_items - 1),
C
Chris Mason 已提交
2508
		     push_space);
2509
	old_left_nritems = btrfs_header_nritems(left);
2510 2511
	BUG_ON(old_left_nritems < 0);

2512
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
C
Chris Mason 已提交
2513
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
2514
		u32 ioff;
2515

2516
		item = btrfs_item_nr(left, i);
2517 2518 2519 2520 2521 2522 2523 2524
		if (!left->map_token) {
			map_extent_buffer(left, (unsigned long)item,
					sizeof(struct btrfs_item),
					&left->map_token, &left->kaddr,
					&left->map_start, &left->map_len,
					KM_USER1);
		}

2525 2526
		ioff = btrfs_item_offset(left, item);
		btrfs_set_item_offset(left, item,
2527
		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
2528
	}
2529
	btrfs_set_header_nritems(left, old_left_nritems + push_items);
2530 2531 2532 2533
	if (left->map_token) {
		unmap_extent_buffer(left, left->map_token, KM_USER1);
		left->map_token = NULL;
	}
2534 2535

	/* fixup right node */
2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549
	if (push_items > right_nritems) {
		printk("push items %d nr %u\n", push_items, right_nritems);
		WARN_ON(1);
	}

	if (push_items < right_nritems) {
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(root, right);
		memmove_extent_buffer(right, btrfs_leaf_data(right) +
				      BTRFS_LEAF_DATA_SIZE(root) - push_space,
				      btrfs_leaf_data(right) +
				      leaf_data_end(root, right), push_space);

		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
2550 2551 2552
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
2553
	}
2554 2555
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
C
Chris Mason 已提交
2556
	push_space = BTRFS_LEAF_DATA_SIZE(root);
2557 2558
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(right, i);
2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573

		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		push_space = push_space - btrfs_item_size(right, item);
		btrfs_set_item_offset(right, item, push_space);
	}
	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
2574
	}
2575

2576
	btrfs_mark_buffer_dirty(left);
2577 2578
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
2579

Z
Zheng Yan 已提交
2580 2581 2582 2583
	ret = btrfs_update_ref(trans, root, right, left,
			       old_left_nritems, push_items);
	BUG_ON(ret);

2584 2585
	btrfs_item_key(right, &disk_key, 0);
	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
C
Chris Mason 已提交
2586 2587
	if (wret)
		ret = wret;
2588 2589 2590 2591

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		path->slots[0] += old_left_nritems;
2592 2593 2594
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
2595 2596
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
2597 2598
		path->slots[1] -= 1;
	} else {
2599
		btrfs_tree_unlock(left);
2600
		free_extent_buffer(left);
2601 2602
		path->slots[0] -= push_items;
	}
2603
	BUG_ON(path->slots[0] < 0);
C
Chris Mason 已提交
2604
	return ret;
2605 2606 2607 2608
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
2609 2610
}

C
Chris Mason 已提交
2611 2612 2613
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
C
Chris Mason 已提交
2614 2615
 *
 * returns 0 if all went well and < 0 on failure.
C
Chris Mason 已提交
2616
 */
2617 2618 2619 2620 2621
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
2622
{
2623
	struct extent_buffer *l;
2624
	u32 nritems;
2625 2626
	int mid;
	int slot;
2627
	struct extent_buffer *right;
C
Chris Mason 已提交
2628
	int space_needed = data_size + sizeof(struct btrfs_item);
2629 2630 2631
	int data_copy_size;
	int rt_data_off;
	int i;
2632
	int ret = 0;
C
Chris Mason 已提交
2633
	int wret;
2634 2635
	int double_split;
	int num_doubles = 0;
2636
	struct btrfs_disk_key disk_key;
C
Chris Mason 已提交
2637

2638 2639 2640
	if (extend)
		space_needed = data_size;

C
Chris Mason 已提交
2641
	/* first try to make some room by pushing left and right */
2642
	if (ins_key->type != BTRFS_DIR_ITEM_KEY) {
2643
		wret = push_leaf_right(trans, root, path, data_size, 0);
2644
		if (wret < 0) {
C
Chris Mason 已提交
2645
			return wret;
2646 2647
		}
		if (wret) {
2648
			wret = push_leaf_left(trans, root, path, data_size, 0);
2649 2650 2651 2652
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];
C
Chris Mason 已提交
2653

2654
		/* did the pushes work? */
2655
		if (btrfs_leaf_free_space(root, l) >= space_needed)
2656
			return 0;
2657
	}
C
Chris Mason 已提交
2658

C
Chris Mason 已提交
2659
	if (!path->nodes[1]) {
2660
		ret = insert_new_root(trans, root, path, 1);
C
Chris Mason 已提交
2661 2662 2663
		if (ret)
			return ret;
	}
2664 2665 2666
again:
	double_split = 0;
	l = path->nodes[0];
2667
	slot = path->slots[0];
2668
	nritems = btrfs_header_nritems(l);
2669
	mid = (nritems + 1)/ 2;
2670

2671
	right = btrfs_alloc_free_block(trans, root, root->leafsize,
Z
Zheng Yan 已提交
2672 2673 2674
					path->nodes[1]->start,
					root->root_key.objectid,
					trans->transid, 0, l->start, 0);
2675 2676
	if (IS_ERR(right)) {
		BUG_ON(1);
2677
		return PTR_ERR(right);
2678
	}
2679 2680

	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2681
	btrfs_set_header_bytenr(right, right->start);
2682 2683 2684 2685 2686 2687
	btrfs_set_header_generation(right, trans->transid);
	btrfs_set_header_owner(right, root->root_key.objectid);
	btrfs_set_header_level(right, 0);
	write_extent_buffer(right, root->fs_info->fsid,
			    (unsigned long)btrfs_header_fsid(right),
			    BTRFS_FSID_SIZE);
2688 2689 2690 2691

	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
			    BTRFS_UUID_SIZE);
2692 2693 2694 2695 2696 2697
	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + space_needed >
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (slot >= nritems) {
				btrfs_cpu_key_to_disk(&disk_key, ins_key);
2698
				btrfs_set_header_nritems(right, 0);
2699
				wret = insert_ptr(trans, root, path,
2700
						  &disk_key, right->start,
2701 2702 2703
						  path->slots[1] + 1, 1);
				if (wret)
					ret = wret;
2704 2705

				btrfs_tree_unlock(path->nodes[0]);
2706 2707
				free_extent_buffer(path->nodes[0]);
				path->nodes[0] = right;
2708 2709
				path->slots[0] = 0;
				path->slots[1] += 1;
2710
				btrfs_mark_buffer_dirty(right);
2711 2712 2713
				return ret;
			}
			mid = slot;
2714 2715 2716 2717 2718
			if (mid != nritems &&
			    leaf_space_used(l, mid, nritems - mid) +
			    space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
				double_split = 1;
			}
2719 2720 2721 2722
		}
	} else {
		if (leaf_space_used(l, 0, mid + 1) + space_needed >
			BTRFS_LEAF_DATA_SIZE(root)) {
2723
			if (!extend && slot == 0) {
2724
				btrfs_cpu_key_to_disk(&disk_key, ins_key);
2725
				btrfs_set_header_nritems(right, 0);
2726 2727
				wret = insert_ptr(trans, root, path,
						  &disk_key,
2728
						  right->start,
2729
						  path->slots[1], 1);
2730 2731
				if (wret)
					ret = wret;
2732
				btrfs_tree_unlock(path->nodes[0]);
2733 2734
				free_extent_buffer(path->nodes[0]);
				path->nodes[0] = right;
2735
				path->slots[0] = 0;
2736 2737 2738 2739 2740 2741
				if (path->slots[1] == 0) {
					wret = fixup_low_keys(trans, root,
					           path, &disk_key, 1);
					if (wret)
						ret = wret;
				}
2742
				btrfs_mark_buffer_dirty(right);
2743
				return ret;
2744 2745 2746 2747 2748 2749 2750 2751 2752
			} else if (extend && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    space_needed > BTRFS_LEAF_DATA_SIZE(root)) {
					double_split = 1;
				}
2753
			}
2754 2755
		}
	}
2756 2757 2758 2759 2760 2761 2762 2763 2764
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);

	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	copy_extent_buffer(right, l,
C
Chris Mason 已提交
2765 2766 2767
		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
		     data_copy_size, btrfs_leaf_data(l) +
		     leaf_data_end(root, l), data_copy_size);
2768

C
Chris Mason 已提交
2769
	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
2770
		      btrfs_item_end_nr(l, mid);
C
Chris Mason 已提交
2771

2772 2773
	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(right, i);
2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784
		u32 ioff;

		if (!right->map_token) {
			map_extent_buffer(right, (unsigned long)item,
					sizeof(struct btrfs_item),
					&right->map_token, &right->kaddr,
					&right->map_start, &right->map_len,
					KM_USER1);
		}

		ioff = btrfs_item_offset(right, item);
2785
		btrfs_set_item_offset(right, item, ioff + rt_data_off);
C
Chris Mason 已提交
2786
	}
C
Chris Mason 已提交
2787

2788 2789 2790 2791 2792
	if (right->map_token) {
		unmap_extent_buffer(right, right->map_token, KM_USER1);
		right->map_token = NULL;
	}

2793
	btrfs_set_header_nritems(l, mid);
C
Chris Mason 已提交
2794
	ret = 0;
2795
	btrfs_item_key(right, &disk_key, 0);
2796 2797
	wret = insert_ptr(trans, root, path, &disk_key, right->start,
			  path->slots[1] + 1, 1);
C
Chris Mason 已提交
2798 2799
	if (wret)
		ret = wret;
2800 2801 2802

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
2803
	BUG_ON(path->slots[0] != slot);
2804

Z
Zheng Yan 已提交
2805 2806 2807
	ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
	BUG_ON(ret);

2808
	if (mid <= slot) {
2809
		btrfs_tree_unlock(path->nodes[0]);
2810 2811
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
2812 2813
		path->slots[0] -= mid;
		path->slots[1] += 1;
2814 2815
	} else {
		btrfs_tree_unlock(right);
2816
		free_extent_buffer(right);
2817
	}
2818

2819
	BUG_ON(path->slots[0] < 0);
2820

2821 2822 2823 2824
	if (double_split) {
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
2825
	}
2826 2827 2828
	return ret;
}

C
Chris Mason 已提交
2829 2830 2831 2832 2833 2834
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
C
Chris Mason 已提交
2835 2836 2837
int btrfs_truncate_item(struct btrfs_trans_handle *trans,
			struct btrfs_root *root,
			struct btrfs_path *path,
2838
			u32 new_size, int from_end)
C
Chris Mason 已提交
2839 2840 2841 2842
{
	int ret = 0;
	int slot;
	int slot_orig;
2843 2844
	struct extent_buffer *leaf;
	struct btrfs_item *item;
C
Chris Mason 已提交
2845 2846 2847 2848 2849 2850 2851 2852
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;

	slot_orig = path->slots[0];
2853
	leaf = path->nodes[0];
2854 2855 2856 2857 2858
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	if (old_size == new_size)
		return 0;
C
Chris Mason 已提交
2859

2860
	nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
2861 2862
	data_end = leaf_data_end(root, leaf);

2863
	old_data_start = btrfs_item_offset_nr(leaf, slot);
2864

C
Chris Mason 已提交
2865 2866 2867 2868 2869 2870 2871 2872 2873 2874
	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
2875 2876
		u32 ioff;
		item = btrfs_item_nr(leaf, i);
2877 2878 2879 2880 2881 2882 2883 2884 2885

		if (!leaf->map_token) {
			map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
		}

2886 2887
		ioff = btrfs_item_offset(leaf, item);
		btrfs_set_item_offset(leaf, item, ioff + size_diff);
C
Chris Mason 已提交
2888
	}
2889 2890 2891 2892 2893 2894

	if (leaf->map_token) {
		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
		leaf->map_token = NULL;
	}

C
Chris Mason 已提交
2895
	/* shift the data */
2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934
	if (from_end) {
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start + new_size - data_end);
	} else {
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				        (unsigned long)fi,
				        offsetof(struct btrfs_file_extent_item,
						 disk_bytenr));
			}
		}

		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start - data_end);

		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(trans, root, path, &disk_key, 1);
	}
2935 2936 2937 2938

	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);
C
Chris Mason 已提交
2939 2940

	ret = 0;
2941 2942
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
C
Chris Mason 已提交
2943
		BUG();
2944
	}
C
Chris Mason 已提交
2945 2946 2947
	return ret;
}

C
Chris Mason 已提交
2948 2949 2950
/*
 * make the item pointed to by the path bigger, data_size is the new size.
 */
2951 2952 2953
int btrfs_extend_item(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root, struct btrfs_path *path,
		      u32 data_size)
2954 2955 2956 2957
{
	int ret = 0;
	int slot;
	int slot_orig;
2958 2959
	struct extent_buffer *leaf;
	struct btrfs_item *item;
2960 2961 2962 2963 2964 2965 2966
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;

	slot_orig = path->slots[0];
2967
	leaf = path->nodes[0];
2968

2969
	nritems = btrfs_header_nritems(leaf);
2970 2971
	data_end = leaf_data_end(root, leaf);

2972 2973
	if (btrfs_leaf_free_space(root, leaf) < data_size) {
		btrfs_print_leaf(root, leaf);
2974
		BUG();
2975
	}
2976
	slot = path->slots[0];
2977
	old_data = btrfs_item_end_nr(leaf, slot);
2978 2979

	BUG_ON(slot < 0);
2980 2981 2982 2983 2984
	if (slot >= nritems) {
		btrfs_print_leaf(root, leaf);
		printk("slot %d too large, nritems %d\n", slot, nritems);
		BUG_ON(1);
	}
2985 2986 2987 2988 2989 2990

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
2991 2992
		u32 ioff;
		item = btrfs_item_nr(leaf, i);
2993 2994 2995 2996 2997 2998 2999 3000

		if (!leaf->map_token) {
			map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
		}
3001 3002
		ioff = btrfs_item_offset(leaf, item);
		btrfs_set_item_offset(leaf, item, ioff - data_size);
3003
	}
3004

3005 3006 3007 3008 3009
	if (leaf->map_token) {
		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
		leaf->map_token = NULL;
	}

3010
	/* shift the data */
3011
	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3012 3013
		      data_end - data_size, btrfs_leaf_data(leaf) +
		      data_end, old_data - data_end);
3014

3015
	data_end = old_data;
3016 3017 3018 3019
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(leaf, slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);
3020 3021

	ret = 0;
3022 3023
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
3024
		BUG();
3025
	}
3026 3027 3028
	return ret;
}

3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177
/*
 * Given a key and some data, insert items into the tree.
 * This does all the path init required, making room in the tree if needed.
 * Returns the number of keys that were inserted.
 */
int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_path *path,
			    struct btrfs_key *cpu_key, u32 *data_size,
			    int nr)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	int ret = 0;
	int slot;
	int i;
	u32 nritems;
	u32 total_data = 0;
	u32 total_size = 0;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct btrfs_key found_key;

	found_key.objectid = 0;
	nr = min_t(int, nr, BTRFS_NODEPTRS_PER_BLOCK(root));

	for (i = 0; i < nr; i++)
		total_data += data_size[i];

	total_data = min_t(u32, total_data, BTRFS_LEAF_DATA_SIZE(root));
	total_size = total_data + (nr * sizeof(struct btrfs_item));
	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
	if (ret == 0)
		return -EEXIST;
	if (ret < 0)
		goto out;

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	if (btrfs_leaf_free_space(root, leaf) < total_size) {
		for (i = nr; i >= 0; i--) {
			total_data -= data_size[i];
			total_size -= data_size[i] + sizeof(struct btrfs_item);
			if (total_size < btrfs_leaf_free_space(root, leaf))
				break;
		}
		nr = i;
	}

	slot = path->slots[0];
	BUG_ON(slot < 0);

	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		item = btrfs_item_nr(leaf, slot);
		btrfs_item_key_to_cpu(leaf, &found_key, slot);

		/* figure out how many keys we can insert in here */
		total_data = data_size[0];
		for (i = 1; i < nr; i++) {
			if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
				break;
			total_data += data_size[i];
		}
		nr = i;

		if (old_data < data_end) {
			btrfs_print_leaf(root, leaf);
			printk("slot %d old_data %d data_end %d\n",
			       slot, old_data, data_end);
			BUG_ON(1);
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		WARN_ON(leaf->map_token);
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(leaf, i);
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}

			ioff = btrfs_item_offset(leaf, item);
			btrfs_set_item_offset(leaf, item, ioff - total_data);
		}
		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}

		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end - total_data, btrfs_leaf_data(leaf) +
			      data_end, old_data - data_end);
		data_end = old_data;
	} else {
		/*
		 * this sucks but it has to be done, if we are inserting at
		 * the end of the leaf only insert 1 of the items, since we
		 * have no way of knowing whats on the next leaf and we'd have
		 * to drop our current locks to figure it out
		 */
		nr = 1;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(leaf, slot + i);
		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
		data_end -= data_size[i];
		btrfs_set_item_size(leaf, item, data_size[i]);
	}
	btrfs_set_header_nritems(leaf, nritems + nr);
	btrfs_mark_buffer_dirty(leaf);

	ret = 0;
	if (slot == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
	}

	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
out:
	if (!ret)
		ret = nr;
	return ret;
}

C
Chris Mason 已提交
3178
/*
C
Chris Mason 已提交
3179
 * Given a key and some data, insert items into the tree.
C
Chris Mason 已提交
3180 3181
 * This does all the path init required, making room in the tree if needed.
 */
3182
int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3183 3184
			    struct btrfs_root *root,
			    struct btrfs_path *path,
3185 3186
			    struct btrfs_key *cpu_key, u32 *data_size,
			    int nr)
3187
{
3188 3189
	struct extent_buffer *leaf;
	struct btrfs_item *item;
C
Chris Mason 已提交
3190
	int ret = 0;
3191
	int slot;
3192
	int slot_orig;
3193
	int i;
3194
	u32 nritems;
3195 3196
	u32 total_size = 0;
	u32 total_data = 0;
3197
	unsigned int data_end;
C
Chris Mason 已提交
3198 3199
	struct btrfs_disk_key disk_key;

3200 3201 3202
	for (i = 0; i < nr; i++) {
		total_data += data_size[i];
	}
3203

3204
	total_size = total_data + (nr * sizeof(struct btrfs_item));
3205
	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
J
Josef Bacik 已提交
3206
	if (ret == 0)
3207
		return -EEXIST;
3208 3209
	if (ret < 0)
		goto out;
3210

3211
	slot_orig = path->slots[0];
3212
	leaf = path->nodes[0];
C
Chris Mason 已提交
3213

3214
	nritems = btrfs_header_nritems(leaf);
C
Chris Mason 已提交
3215
	data_end = leaf_data_end(root, leaf);
3216

3217
	if (btrfs_leaf_free_space(root, leaf) < total_size) {
3218 3219
		btrfs_print_leaf(root, leaf);
		printk("not enough freespace need %u have %d\n",
3220
		       total_size, btrfs_leaf_free_space(root, leaf));
3221
		BUG();
3222
	}
3223

3224
	slot = path->slots[0];
3225
	BUG_ON(slot < 0);
3226

3227
	if (slot != nritems) {
3228
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
3229

3230 3231 3232 3233 3234 3235
		if (old_data < data_end) {
			btrfs_print_leaf(root, leaf);
			printk("slot %d old_data %d data_end %d\n",
			       slot, old_data, data_end);
			BUG_ON(1);
		}
3236 3237 3238 3239
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
3240
		WARN_ON(leaf->map_token);
C
Chris Mason 已提交
3241
		for (i = slot; i < nritems; i++) {
3242
			u32 ioff;
3243

3244
			item = btrfs_item_nr(leaf, i);
3245 3246 3247 3248 3249 3250 3251 3252
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}

3253
			ioff = btrfs_item_offset(leaf, item);
3254
			btrfs_set_item_offset(leaf, item, ioff - total_data);
C
Chris Mason 已提交
3255
		}
3256 3257 3258 3259
		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}
3260 3261

		/* shift the items */
3262
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
3263
			      btrfs_item_nr_offset(slot),
C
Chris Mason 已提交
3264
			      (nritems - slot) * sizeof(struct btrfs_item));
3265 3266

		/* shift the data */
3267
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
3268
			      data_end - total_data, btrfs_leaf_data(leaf) +
C
Chris Mason 已提交
3269
			      data_end, old_data - data_end);
3270 3271
		data_end = old_data;
	}
3272

3273
	/* setup the item for the new data */
3274 3275 3276 3277 3278 3279 3280 3281 3282
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(leaf, slot + i);
		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
		data_end -= data_size[i];
		btrfs_set_item_size(leaf, item, data_size[i]);
	}
	btrfs_set_header_nritems(leaf, nritems + nr);
3283
	btrfs_mark_buffer_dirty(leaf);
C
Chris Mason 已提交
3284 3285

	ret = 0;
3286 3287
	if (slot == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
3288
		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
3289
	}
C
Chris Mason 已提交
3290

3291 3292
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
3293
		BUG();
3294
	}
3295
out:
3296 3297 3298 3299 3300 3301 3302
	return ret;
}

/*
 * Given a key and some data, insert an item into the tree.
 * This does all the path init required, making room in the tree if needed.
 */
3303 3304 3305
int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *cpu_key, void *data, u32
		      data_size)
3306 3307
{
	int ret = 0;
C
Chris Mason 已提交
3308
	struct btrfs_path *path;
3309 3310
	struct extent_buffer *leaf;
	unsigned long ptr;
3311

C
Chris Mason 已提交
3312 3313 3314
	path = btrfs_alloc_path();
	BUG_ON(!path);
	ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3315
	if (!ret) {
3316 3317 3318 3319
		leaf = path->nodes[0];
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		write_extent_buffer(leaf, data, ptr, data_size);
		btrfs_mark_buffer_dirty(leaf);
3320
	}
C
Chris Mason 已提交
3321
	btrfs_free_path(path);
C
Chris Mason 已提交
3322
	return ret;
3323 3324
}

C
Chris Mason 已提交
3325
/*
C
Chris Mason 已提交
3326
 * delete the pointer from a given node.
C
Chris Mason 已提交
3327
 *
C
Chris Mason 已提交
3328 3329
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
C
Chris Mason 已提交
3330
 */
3331 3332
static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		   struct btrfs_path *path, int level, int slot)
3333
{
3334
	struct extent_buffer *parent = path->nodes[level];
3335
	u32 nritems;
C
Chris Mason 已提交
3336
	int ret = 0;
3337
	int wret;
3338

3339
	nritems = btrfs_header_nritems(parent);
3340
	if (slot != nritems -1) {
3341 3342 3343
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
C
Chris Mason 已提交
3344 3345
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
3346
	}
3347
	nritems--;
3348
	btrfs_set_header_nritems(parent, nritems);
3349
	if (nritems == 0 && parent == root->node) {
3350
		BUG_ON(btrfs_header_level(root->node) != 1);
3351
		/* just turn the root into a leaf and break */
3352
		btrfs_set_header_level(root->node, 0);
3353
	} else if (slot == 0) {
3354 3355 3356 3357
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
C
Chris Mason 已提交
3358 3359
		if (wret)
			ret = wret;
3360
	}
C
Chris Mason 已提交
3361
	btrfs_mark_buffer_dirty(parent);
C
Chris Mason 已提交
3362
	return ret;
3363 3364
}

3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].  bytenr is the node block pointer, but since the callers
 * already know it, it is faster to have them pass it down than to
 * read it out of the node again.
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.  zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
			    struct btrfs_root *root,
			    struct btrfs_path *path, u64 bytenr)
{
	int ret;
	u64 root_gen = btrfs_header_generation(path->nodes[1]);

	ret = del_ptr(trans, root, path, 1, path->slots[1]);
	if (ret)
		return ret;

	ret = btrfs_free_extent(trans, root, bytenr,
				btrfs_level_size(root, 0),
				path->nodes[1]->start,
				btrfs_header_owner(path->nodes[1]),
3392
				root_gen, 0, 1);
3393 3394
	return ret;
}
C
Chris Mason 已提交
3395 3396 3397 3398
/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 */
3399 3400
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
3401
{
3402 3403
	struct extent_buffer *leaf;
	struct btrfs_item *item;
3404 3405
	int last_off;
	int dsize = 0;
C
Chris Mason 已提交
3406 3407
	int ret = 0;
	int wret;
3408
	int i;
3409
	u32 nritems;
3410

3411
	leaf = path->nodes[0];
3412 3413 3414 3415 3416
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

3417
	nritems = btrfs_header_nritems(leaf);
3418

3419
	if (slot + nr != nritems) {
C
Chris Mason 已提交
3420
		int data_end = leaf_data_end(root, leaf);
3421 3422

		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
C
Chris Mason 已提交
3423 3424
			      data_end + dsize,
			      btrfs_leaf_data(leaf) + data_end,
3425
			      last_off - data_end);
3426

3427
		for (i = slot + nr; i < nritems; i++) {
3428
			u32 ioff;
3429

3430
			item = btrfs_item_nr(leaf, i);
3431 3432 3433 3434 3435 3436 3437
			if (!leaf->map_token) {
				map_extent_buffer(leaf, (unsigned long)item,
					sizeof(struct btrfs_item),
					&leaf->map_token, &leaf->kaddr,
					&leaf->map_start, &leaf->map_len,
					KM_USER1);
			}
3438 3439
			ioff = btrfs_item_offset(leaf, item);
			btrfs_set_item_offset(leaf, item, ioff + dsize);
C
Chris Mason 已提交
3440
		}
3441 3442 3443 3444 3445 3446

		if (leaf->map_token) {
			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
			leaf->map_token = NULL;
		}

3447
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
3448
			      btrfs_item_nr_offset(slot + nr),
C
Chris Mason 已提交
3449
			      sizeof(struct btrfs_item) *
3450
			      (nritems - slot - nr));
3451
	}
3452 3453
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;
3454

C
Chris Mason 已提交
3455
	/* delete the leaf if we've emptied it */
3456
	if (nritems == 0) {
3457 3458
		if (leaf == root->node) {
			btrfs_set_header_level(leaf, 0);
3459
		} else {
3460 3461
			ret = btrfs_del_leaf(trans, root, path, leaf->start);
			BUG_ON(ret);
3462
		}
3463
	} else {
3464
		int used = leaf_space_used(leaf, 0, nritems);
C
Chris Mason 已提交
3465
		if (slot == 0) {
3466 3467 3468
			struct btrfs_disk_key disk_key;

			btrfs_item_key(leaf, &disk_key, 0);
3469
			wret = fixup_low_keys(trans, root, path,
3470
					      &disk_key, 1);
C
Chris Mason 已提交
3471 3472 3473 3474
			if (wret)
				ret = wret;
		}

C
Chris Mason 已提交
3475
		/* delete the leaf if it is mostly empty */
3476
		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
3477 3478 3479 3480
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
3481
			slot = path->slots[1];
3482 3483
			extent_buffer_get(leaf);

3484
			wret = push_leaf_left(trans, root, path, 1, 1);
3485
			if (wret < 0 && wret != -ENOSPC)
C
Chris Mason 已提交
3486
				ret = wret;
3487 3488 3489

			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
3490
				wret = push_leaf_right(trans, root, path, 1, 1);
3491
				if (wret < 0 && wret != -ENOSPC)
C
Chris Mason 已提交
3492 3493
					ret = wret;
			}
3494 3495

			if (btrfs_header_nritems(leaf) == 0) {
3496 3497 3498
				path->slots[1] = slot;
				ret = btrfs_del_leaf(trans, root, path, leaf->start);
				BUG_ON(ret);
3499
				free_extent_buffer(leaf);
C
Chris Mason 已提交
3500
			} else {
3501 3502 3503 3504 3505 3506 3507
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
3508
				free_extent_buffer(leaf);
3509
			}
3510
		} else {
3511
			btrfs_mark_buffer_dirty(leaf);
3512 3513
		}
	}
C
Chris Mason 已提交
3514
	return ret;
3515 3516
}

3517
/*
3518
 * search the tree again to find a leaf with lesser keys
3519 3520
 * returns 0 if it found something or 1 if there are no lesser leaves.
 * returns < 0 on io errors.
C
Chris Mason 已提交
3521 3522 3523
 *
 * This may release the path, and so you may lose any locks held at the
 * time you call it.
3524 3525 3526
 */
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
3527 3528 3529
	struct btrfs_key key;
	struct btrfs_disk_key found_key;
	int ret;
3530

3531
	btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
3532

3533 3534 3535 3536 3537 3538 3539 3540
	if (key.offset > 0)
		key.offset--;
	else if (key.type > 0)
		key.type--;
	else if (key.objectid > 0)
		key.objectid--;
	else
		return 1;
3541

3542 3543 3544 3545 3546 3547 3548 3549 3550
	btrfs_release_path(root, path);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;
	btrfs_item_key(path->nodes[0], &found_key, 0);
	ret = comp_keys(&found_key, &key);
	if (ret < 0)
		return 0;
	return 1;
3551 3552
}

3553 3554 3555
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that are either in cache or have a minimum
C
Chris Mason 已提交
3556
 * transaction id.  This is used by the btree defrag code, and tree logging
3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This does lock as it descends, and path->keep_locks should be set
 * to 1 by the caller.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
C
Chris Mason 已提交
3568 3569 3570 3571
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
3572 3573 3574 3575
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
3576
			 struct btrfs_key *max_key,
3577 3578 3579 3580 3581 3582
			 struct btrfs_path *path, int cache_only,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
3583
	int sret;
3584 3585 3586 3587 3588 3589 3590
	u32 nritems;
	int level;
	int ret = 1;

again:
	cur = btrfs_lock_root_node(root);
	level = btrfs_header_level(cur);
3591
	WARN_ON(path->nodes[level]);
3592 3593 3594 3595 3596 3597 3598 3599 3600 3601
	path->nodes[level] = cur;
	path->locks[level] = 1;

	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while(1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
3602
		sret = bin_search(cur, min_key, level, &slot);
3603

3604 3605
		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
3606 3607
			if (slot >= nritems)
				goto find_next_key;
3608 3609 3610 3611 3612
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
3613 3614
		if (sret && slot > 0)
			slot--;
3615 3616 3617 3618 3619 3620 3621 3622 3623
		/*
		 * check this node pointer against the cache_only and
		 * min_trans parameters.  If it isn't in cache or is too
		 * old, skip to the next one.
		 */
		while(slot < nritems) {
			u64 blockptr;
			u64 gen;
			struct extent_buffer *tmp;
3624 3625
			struct btrfs_disk_key disk_key;

3626 3627 3628 3629 3630 3631 3632 3633 3634
			blockptr = btrfs_node_blockptr(cur, slot);
			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			if (!cache_only)
				break;

3635 3636 3637 3638 3639 3640 3641 3642
			if (max_key) {
				btrfs_node_key(cur, &disk_key, slot);
				if (comp_keys(&disk_key, max_key) >= 0) {
					ret = 1;
					goto out;
				}
			}

3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653
			tmp = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));

			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
				free_extent_buffer(tmp);
				break;
			}
			if (tmp)
				free_extent_buffer(tmp);
			slot++;
		}
3654
find_next_key:
3655 3656 3657 3658 3659
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
3660 3661
			path->slots[level] = slot;
			sret = btrfs_find_next_key(root, path, min_key, level,
3662
						  cache_only, min_trans);
3663
			if (sret == 0) {
3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702
				btrfs_release_path(root, path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			unlock_up(path, level, 1);
			goto out;
		}
		cur = read_node_slot(root, cur, slot);

		btrfs_tree_lock(cur);
		path->locks[level - 1] = 1;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1);
	}
out:
	if (ret == 0)
		memcpy(min_key, &found_key, sizeof(found_key));
	return ret;
}

/*
 * this is similar to btrfs_next_leaf, but does not try to preserve
 * and fixup the path.  It looks for and returns the next key in the
 * tree based on the current path and the cache_only and min_trans
 * parameters.
 *
 * 0 is returned if another key is found, < 0 if there are any errors
 * and 1 is returned if there are no higher keys in the tree
 *
 * path->keep_locks should be set to 1 on the search made before
 * calling this function.
 */
3703
int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
3704 3705
			struct btrfs_key *key, int lowest_level,
			int cache_only, u64 min_trans)
3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716
{
	int level = lowest_level;
	int slot;
	struct extent_buffer *c;

	while(level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level])
			return 1;

		slot = path->slots[level] + 1;
		c = path->nodes[level];
3717
next:
3718 3719 3720 3721 3722 3723 3724 3725 3726
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				return 1;
			}
			continue;
		}
		if (level == 0)
			btrfs_item_key_to_cpu(c, key, slot);
3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746
		else {
			u64 blockptr = btrfs_node_blockptr(c, slot);
			u64 gen = btrfs_node_ptr_generation(c, slot);

			if (cache_only) {
				struct extent_buffer *cur;
				cur = btrfs_find_tree_block(root, blockptr,
					    btrfs_level_size(root, level - 1));
				if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
					slot++;
					if (cur)
						free_extent_buffer(cur);
					goto next;
				}
				free_extent_buffer(cur);
			}
			if (gen < min_trans) {
				slot++;
				goto next;
			}
3747
			btrfs_node_key_to_cpu(c, key, slot);
3748
		}
3749 3750 3751 3752 3753
		return 0;
	}
	return 1;
}

C
Chris Mason 已提交
3754
/*
3755
 * search the tree again to find a leaf with greater keys
C
Chris Mason 已提交
3756 3757
 * returns 0 if it found something or 1 if there are no greater leaves.
 * returns < 0 on io errors.
C
Chris Mason 已提交
3758
 */
C
Chris Mason 已提交
3759
int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
3760 3761 3762
{
	int slot;
	int level = 1;
3763 3764
	struct extent_buffer *c;
	struct extent_buffer *next = NULL;
3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776
	struct btrfs_key key;
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0) {
		return 1;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);

	btrfs_release_path(root, path);
3777
	path->keep_locks = 1;
3778 3779 3780 3781 3782 3783
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

3784
	nritems = btrfs_header_nritems(path->nodes[0]);
3785 3786 3787 3788 3789 3790
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
3791
	if (nritems > 0 && path->slots[0] < nritems - 1) {
3792
		path->slots[0]++;
3793 3794
		goto done;
	}
3795

C
Chris Mason 已提交
3796
	while(level < BTRFS_MAX_LEVEL) {
3797
		if (!path->nodes[level])
C
Chris Mason 已提交
3798
			return 1;
3799

3800 3801
		slot = path->slots[level] + 1;
		c = path->nodes[level];
3802
		if (slot >= btrfs_header_nritems(c)) {
3803
			level++;
3804
			if (level == BTRFS_MAX_LEVEL) {
3805
				return 1;
3806
			}
3807 3808
			continue;
		}
3809

3810 3811
		if (next) {
			btrfs_tree_unlock(next);
3812
			free_extent_buffer(next);
3813
		}
3814

3815 3816
		if (level == 1 && (path->locks[1] || path->skip_locking) &&
		    path->reada)
3817
			reada_for_search(root, path, level, slot, 0);
3818

3819
		next = read_node_slot(root, c, slot);
3820 3821 3822 3823
		if (!path->skip_locking) {
			WARN_ON(!btrfs_tree_locked(c));
			btrfs_tree_lock(next);
		}
3824 3825 3826 3827 3828 3829
		break;
	}
	path->slots[level] = slot;
	while(1) {
		level--;
		c = path->nodes[level];
3830 3831
		if (path->locks[level])
			btrfs_tree_unlock(c);
3832
		free_extent_buffer(c);
3833 3834
		path->nodes[level] = next;
		path->slots[level] = 0;
3835 3836
		if (!path->skip_locking)
			path->locks[level] = 1;
3837 3838
		if (!level)
			break;
3839 3840
		if (level == 1 && path->locks[1] && path->reada)
			reada_for_search(root, path, level, slot, 0);
3841
		next = read_node_slot(root, next, 0);
3842 3843 3844 3845
		if (!path->skip_locking) {
			WARN_ON(!btrfs_tree_locked(path->nodes[level]));
			btrfs_tree_lock(next);
		}
3846
	}
3847 3848
done:
	unlock_up(path, 0, 1);
3849 3850
	return 0;
}
3851

3852 3853 3854 3855 3856 3857
/*
 * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
 * searching until it gets past min_objectid or finds an item of 'type'
 *
 * returns 0 if something is found, 1 if nothing was found and < 0 on error
 */
3858 3859 3860 3861 3862 3863
int btrfs_previous_item(struct btrfs_root *root,
			struct btrfs_path *path, u64 min_objectid,
			int type)
{
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
3864
	u32 nritems;
3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875
	int ret;

	while(1) {
		if (path->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, path);
			if (ret != 0)
				return ret;
		} else {
			path->slots[0]--;
		}
		leaf = path->nodes[0];
3876 3877 3878 3879 3880 3881
		nritems = btrfs_header_nritems(leaf);
		if (nritems == 0)
			return 1;
		if (path->slots[0] == nritems)
			path->slots[0]--;

3882 3883 3884
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		if (found_key.type == type)
			return 0;
3885 3886 3887 3888 3889
		if (found_key.objectid < min_objectid)
			break;
		if (found_key.objectid == min_objectid &&
		    found_key.type < type)
			break;
3890 3891 3892
	}
	return 1;
}